1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
25 import numpy
26 from pytools import memoize_method, memoize, Record
27 import pycuda.driver as cuda
28 import pycuda.gpuarray as gpuarray
29 from pycuda.compiler import SourceModule
30 import hedge.backends.cuda.plan
31 from pymbolic.mapper.c_code import CCodeMapper
32 from hedge.flux import FluxIdentityMapper
33
34 from codepy.cgen import \
35 Pointer, POD, Value, ArrayOf, Typedef, \
36 Module, FunctionDeclaration, FunctionBody, Block, \
37 Comment, Line, Include, \
38 MaybeUnused, \
39 Define, \
40 Initializer, If, For, Statement, Assign, While
41
42 from pymbolic.mapper.stringifier import PREC_NONE


@memoize
def face_pair_struct(float_type, dims):
    from codepy.cgen import GenerableStruct, POD, ArrayOf
    return GenerableStruct("face_pair", [
        POD(float_type, "h"),
        POD(float_type, "order"),
        POD(float_type, "face_jacobian"),
        ArrayOf(POD(float_type, "normal"), dims),

        POD(numpy.uint32, "a_base"),
        POD(numpy.uint32, "b_base"),

        POD(numpy.uint16, "a_ilist_index"),
        POD(numpy.uint16, "b_ilist_index"),
        POD(numpy.uint16, "b_write_ilist_index"),
        POD(numpy.uint16, "boundary_bitmap"),
        POD(numpy.uint16, "a_dest"),
        POD(numpy.uint16, "b_dest"),
        ],

        # pad so the struct size is coprime to 2 (in units of its
        # alignment), spreading consecutive face_pairs across banks
        aligned_prime_to=[2],
        )


@memoize
def flux_header_struct(float_type, dims):
    from codepy.cgen import GenerableStruct, POD

    return GenerableStruct("flux_header", [
        POD(numpy.uint16, "same_facepairs_end"),
        POD(numpy.uint16, "diff_facepairs_end"),
        POD(numpy.uint16, "bdry_facepairs_end"),
        ], align_bytes=face_pair_struct(float_type, dims).alignment_requirement())
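
# Both structs are emitted verbatim as C struct declarations into the kernel
# source (see get_kernel below). A sketch of the generated C for face_pair --
# the exact field types depend on float_type and dims:
#
#   struct face_pair
#   {
#     float h, order, face_jacobian;
#     float normal[DIMENSIONS];
#     unsigned a_base, b_base;
#     unsigned short a_ilist_index, b_ilist_index, b_write_ilist_index;
#     unsigned short boundary_bitmap;
#     unsigned short a_dest, b_dest;
#   };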


class FluxConcretizer(FluxIdentityMapper):
    def __init__(self, int_field_expr, ext_field_expr, dep_to_index):
        self.int_field_expr = int_field_expr
        self.ext_field_expr = ext_field_expr
        self.dep_to_index = dep_to_index

    def map_field_component(self, expr):
        if expr.is_local:
            prefix = "a"
            f_expr = self.int_field_expr
        else:
            prefix = "b"
            f_expr = self.ext_field_expr

        from hedge.tools import is_obj_array, is_zero
        from pymbolic import var
        if is_obj_array(f_expr):
            f_expr = f_expr[expr.index]
        else:
            assert expr.index == 0, repr(f_expr)

        if is_zero(f_expr):
            return 0

        return var("val_%s_field%d" % (prefix, self.dep_to_index[f_expr]))
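
    # Interior ("local") traces read from the "a" side of a face pair,
    # exterior ones from the "b" side: a dependency with index 3 in
    # dep_to_index thus becomes val_a_field3 or val_b_field3 in the
    # generated kernel.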


class FluxToCodeMapper(CCodeMapper):
    def __init__(self, dtype):
        self.dtype = dtype

        def float_mapper(x):
            if isinstance(x, float):
                return "value_type(%s)" % repr(x)
            else:
                return repr(x)

        CCodeMapper.__init__(self, float_mapper, reverse=False)

    def map_normal(self, expr, enclosing_prec):
        return "fpair->normal[%d]" % expr.axis

    def map_penalty_term(self, expr, enclosing_prec):
        return ("pow(fpair->order*fpair->order/fpair->h, %r)"
                % expr.power)
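
    # For example, a penalty term with power 2 renders as the C expression
    # "pow(fpair->order*fpair->order/fpair->h, 2.0)", pulling the face's
    # polynomial order and size measure h out of the face_pair struct.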


def flux_to_code(f2cm, is_flipped, int_field_expr, ext_field_expr,
        dep_to_index, flux, prec):
    if is_flipped:
        int_field_expr, ext_field_expr = ext_field_expr, int_field_expr

    return f2cm(FluxConcretizer(int_field_expr, ext_field_expr,
        dep_to_index)(flux), prec)
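
# A typical call (the names here are illustrative, mirroring the call sites
# below) concretizes a flux expression against the two sides of a face pair
# and renders it to a C expression string:
#
#   code = flux_to_code(f2cm, is_flipped=False,
#           int_field_expr=u_int, ext_field_expr=u_ext,
#           dep_to_index=dep_to_index, flux=flux_expr, prec=PREC_NONE)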


class ExecutionPlan(hedge.backends.cuda.plan.ExecutionPlan):
    def __init__(self, given,
            parallel_faces, mbs_per_block, flux_count,
            direct_store, partition_data):
        hedge.backends.cuda.plan.ExecutionPlan.__init__(self, given)
        self.parallel_faces = parallel_faces
        self.mbs_per_block = mbs_per_block
        self.flux_count = flux_count
        self.direct_store = direct_store

        self.partition_data = partition_data

    def microblocks_per_block(self):
        return self.mbs_per_block

    def elements_per_block(self):
        return self.mbs_per_block*self.given.microblock.elements

    def dofs_per_block(self):
        return self.mbs_per_block*self.given.microblock.aligned_floats
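
    # parallel_faces sets how many faces a thread block processes
    # concurrently (the threadIdx.y extent), mbs_per_block how many
    # microblocks share one block's face-pair data; together they fix the
    # block dimensions and the shared memory footprint.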

    @memoize_method
    def face_pair_count(self):
        return self.partition_data.max_face_pair_count

    def __str__(self):
        result = ("%s pfaces=%d mbs_per_block=%d mb_elements=%d" % (
            hedge.backends.cuda.plan.ExecutionPlan.__str__(self),
            self.parallel_faces,
            self.mbs_per_block,
            self.given.microblock.elements,
            ))

        if self.direct_store:
            result += " direct_store"

        return result

    def make_kernel(self, discr, executor, fluxes):
        return Kernel(discr, self, executor, fluxes)


def make_plan(discr, given, tune_for):
    from hedge.backends.cuda.execute import Executor
    if tune_for is not None:
        fbatch1 = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if fbatch1 is not None:
            fluxes = list(fbatch1.flux_exprs)
            flux_count = len(fluxes)
        else:
            fluxes = None
    else:
        fluxes = None

    if fluxes is None:
        # no flux batch to tune for--take a guess
        flux_count = discr.dimensions

    def generate_valid_plans():
        valid_plan_count = 0
        for direct_store in [False, True]:
            for parallel_faces in range(1, 32):
                for mbs_per_block in range(1, 8):
                    flux_plan = ExecutionPlan(given, parallel_faces,
                            mbs_per_block, flux_count,
                            direct_store=direct_store,
                            partition_data=discr._get_partition_data(
                                mbs_per_block*given.microblock.elements))
                    if flux_plan.invalid_reason() is None:
                        valid_plan_count += 1
                        yield flux_plan

            # in single precision, direct store is only a fallback:
            # skip generating direct-store plans if valid plans were
            # already found without it
            if valid_plan_count and given.float_type == numpy.float32:
                return

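    # The plan space is searched exhaustively: when tune_for is given, each
    # candidate plan is benchmarked via target_func, and optimize_plan
    # (maximize=False) keeps the one with the smallest measured run time.
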
    def target_func(plan):
        if tune_for is None:
            return 0
        else:
            return plan.make_kernel(discr, executor=None,
                    fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan

    return optimize_plan(
            "gather",
            generate_valid_plans, target_func,
            maximize=False,
            debug_flags=discr.debug)


class Kernel:
    def __init__(self, discr, plan, executor, fluxes):
        self.discr = discr
        self.plan = plan
        self.executor = executor

        assert isinstance(fluxes, list)
        self.fluxes = fluxes

        interior_deps_set = set()
        boundary_int_deps_set = set()
        boundary_ext_deps_set = set()
        self.dep_to_tag = {}
        for f in fluxes:
            interior_deps_set.update(set(f.interior_deps))
            boundary_int_deps_set.update(set(f.boundary_int_deps))
            boundary_ext_deps_set.update(set(f.boundary_ext_deps))
            self.dep_to_tag.update(f.dep_to_tag)

        self.interior_deps = list(interior_deps_set)
        self.boundary_int_deps = list(boundary_int_deps_set)
        self.boundary_ext_deps = list(boundary_ext_deps_set)
        self.all_deps = list(
                interior_deps_set
                | boundary_int_deps_set
                | boundary_ext_deps_set
                )
        self.dep_to_index = dict((dep, i) for i, dep in enumerate(self.all_deps))

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
                len(discr.mesh.elements)/self.plan.elements_per_block())
        all_fluxes_on_faces = [gpuarray.empty(
            (block_count * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))]

        field = gpuarray.empty(
                (self.plan.dofs_per_block() * block_count,),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=True)

        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            try:
                gather.prepared_call(
                        (block_count, 1),
                        0,
                        fdata.device_memory,
                        *tuple(fof.gpudata for fof in all_fluxes_on_faces))
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()
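
        # Event timestamps are in milliseconds: 1e-3/count converts the
        # total into seconds per single kernel launch, averaged over all
        # timed launches.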
        return 1e-3/count * stop.time_since(start)

    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        given = self.plan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [gpuarray.empty(
            given.matmul_preimage_shape(lift_plan),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)
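
            # Fields are read through 1D textures. allow_double_hack lets
            # double-precision arrays be bound as int2 textures on hardware
            # without native double texture fetches; the kernel reassembles
            # the values via fp_tex1Dfetch from pycuda-helpers.hpp.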

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(gather.prepared_timed_call(
                (len(discr.blocks), 1),
                debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

            discr.gmem_bytes_gather.add(
                    len(discr.blocks) * fdata.block_bytes
                    +
                    given.float_size()
                    * (
                        # fetch: both sides of each face pair read one
                        # face's worth of dofs, for every flux
                        len(self.fluxes)
                        * 2*fdata.fp_count
                        * given.dofs_per_face()

                        # store: one aligned face-dof range per
                        # microblock, flux and block
                        + len(discr.blocks)
                        * len(self.fluxes)
                        * self.plan.microblocks_per_block()
                        * given.aligned_face_dofs_per_microblock()
                        ))
        else:
            gather.prepared_call(
                    (len(discr.blocks), 1),
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces))

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)

                for i in range(len(discr.blocks)*6):
                    print i, copied_debugbuf[i*16:(i+1)*16]

                wait_for_keypress(discr)

481 if "cuda_flux" in discr.debug:
482 from hedge.tools import get_rank, wait_for_keypress
483 if get_rank(discr) == 0:
484 numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6)
485 if True:
486
487 cols = []
488 for k in range(len(all_fluxes_on_faces)):
489 my_fof = all_fluxes_on_faces[k].get()
490 def sstruc(a):
491 result = ""
492 for i in a:
493 if i == 0:
494 result += "0"
495 elif abs(i) < 1e-10:
496 result += "-"
497 elif numpy.isnan(i):
498 result += "N"
499 elif i == 17:
500 result += "*"
501 else:
502 result += "#"
503
504 return result
505
506 useful_sz = given.block_count \
507 * given.microblocks_per_block \
508 * lift_plan.aligned_preimage_dofs_per_microblock
509
510 my_col = []
511 i = 0
512 while i < useful_sz:
513 my_col.append(sstruc(my_fof[i:i+16]))
514 i += 16
515
516 cols.append(my_col)
517
518 from pytools import Table
519 tbl = Table()
520 tbl.add_row(["num"]+range(len(cols)))
521 i = 0
522 for row in zip(*cols):
523 tbl.add_row((i,)+row)
524 i += 1
525 print tbl
526 else:
527 for i in range(len(all_fluxes_on_faces)):
528 print i
529 print all_fluxes_on_faces[i].get()
530
531 wait_for_keypress(discr)
532
533
534 return all_fluxes_on_faces

    def gen_store(self, flux_nr, index, what):
        if self.plan.direct_store:
            return Assign(
                    "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE + %s]" % (flux_nr, index),
                    what)
        else:
            return Assign(
                    "smem_fluxes_on_faces[%d][%s]" % (flux_nr, index),
                    what)
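
    # With direct_store, flux values go straight to global memory; otherwise
    # they are staged in shared memory and copied out in one coalesced pass
    # at the end of the kernel (see the "store fluxes" block in get_kernel).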

    def write_interior_flux_code(self, is_twosided):
        given = self.plan.given

        def get_field(flux_rec, is_interior, flipped):
            if is_interior ^ flipped:
                prefix = "a"
            else:
                prefix = "b"

            return ("val_%s_field%d"
                    % (prefix, self.dep_to_index[flux_rec.field_expr]))

        flux_write_code = Block([])

        flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)]

        if is_twosided:
            flux_var_decl.append(Initializer(POD(given.float_type, "b_flux"), 0))
            prefixes = ["a", "b"]
            flip_values = [False, True]
        else:
            prefixes = ["a"]
            flip_values = [False]

        flux_write_code.append(Line())

        for dep in self.interior_deps:
            for side in ["a", "b"]:
                flux_write_code.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_%s_field%d"
                                % (side, self.dep_to_index[dep]))),
                            "fp_tex1Dfetch(field%d_tex, %s_index)"
                            % (self.dep_to_index[dep], side)))

        f2cm = FluxToCodeMapper(given.float_type)

        flux_sub_codes = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            my_flux_block = Block(flux_var_decl)

            for int_rec in wdflux.interiors:
                for prefix, is_flipped in zip(prefixes, flip_values):
                    my_flux_block.append(
                            Statement("%s_flux += %s"
                                % (prefix,
                                    flux_to_code(f2cm, is_flipped,
                                        int_rec.field_expr,
                                        int_rec.field_expr,
                                        self.dep_to_index,
                                        int_rec.flux_expr, PREC_NONE))))
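
            # The generated accumulation looks, for a simple central-type
            # flux on dependency 0, roughly like:
            #   a_flux += 0.5*fpair->normal[0]*(val_a_field0 + val_b_field0);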

            my_flux_block.append(Line())

            my_flux_block.append(
                    self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                        "fpair->face_jacobian*a_flux"))

            if is_twosided:
                my_flux_block.append(
                        self.gen_store(flux_nr,
                            "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                            "fpair->b_write_ilist_index + FACEDOF_NR)",
                            "fpair->face_jacobian*b_flux"))

            flux_sub_codes.append(my_flux_block)

        flux_write_code.extend(
                Initializer(
                    Value("value_type", f2cm.cse_prefix+str(i)), cse)
                for i, cse in enumerate(f2cm.cses))
        flux_write_code.extend(flux_sub_codes)

        return flux_write_code

    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                            bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(
                MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                0)
            for flux_nr in range(len(self.fluxes))])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_a_field%d"
                                % self.dep_to_index[dep])),
                            "fp_tex1Dfetch(field%d_tex, a_index)"
                            % self.dep_to_index[dep]))
            for dep in ext_deps:
                bblock.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_b_field%d"
                                % self.dep_to_index[dep])),
                            "fp_tex1Dfetch(field%d_tex, b_index)"
                            % self.dep_to_index[dep]))

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                        Statement(("flux%d += " % flux_nr) +
                            flux_to_code(f2cm, is_flipped=False,
                                int_field_expr=flux_rec.bpair.field,
                                ext_field_expr=flux_rec.bpair.bfield,
                                dep_to_index=self.dep_to_index,
                                flux=flux_rec.flux_expr, prec=PREC_NONE)))

            bblock.extend(
                    Initializer(
                        Value("value_type", f2cm.cse_prefix+str(i)), cse)
                    for i, cse in enumerate(f2cm.cses))

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % bdry_number,
                    Block(bblock+comp_code)),
                ])

        flux_write_code.extend(
                [Line()]
                + [self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                    "fpair->face_jacobian * flux%d" % flux_nr)
                    for flux_nr in range(len(self.fluxes))])

        return flux_write_code
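
    # Each boundary tag gets its own guarded section in the generated code,
    # e.g. (for a tag assigned bit 0):
    #   if ((fpair->boundary_bitmap) & (1 << 0))
    #   { /* fetch val_*_field*, accumulate flux0, flux1, ... */ }
    # followed by one unconditional store per flux.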

    @memoize_method
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from codepy.cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Comment(str(dep_expr)),
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("DOFS_PER_FACE", given.dofs_per_face()),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                given.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])
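
        # COALESCING_THREAD_COUNT rounds the thread count down to a multiple
        # of 16 (a half-warp on the hardware this was written for), so the
        # final shared-to-global copy below issues coalesced transactions.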

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            + [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])
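
        # Each face is worked on by THREADS_PER_FACE threads; threadIdx.y
        # (BLOCK_FACE) strides through the block's face-pair list in steps
        # of CONCURRENT_FACES, consuming the three contiguous categories
        # (same-block, cross-block, boundary) in order.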

        f_body.extend([
            Line(),
            S("__syncthreads()"),
            Line()
            ])

        if not fplan.direct_store:
            f_body.extend_log_block("store fluxes", [
                For("unsigned word_nr = THREAD_NUM",
                    "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                    "word_nr += COALESCING_THREAD_COUNT",
                    Block([Assign(
                        "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                        "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                        for flux_nr in range(len(self.fluxes))]))
                ])

        if False:
            # debug aid: dump per-face-pair progress counters
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                    "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"),
                ])

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("flux_gather.cu", "w").write(str(cmod))

        mod = SourceModule(
                cmod,
                keep="cuda_keep_kernels" in discr.debug,
                options=["--maxrregcount=%d" % self.plan.max_registers()])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                    mod.local_size_bytes,
                    mod.shared_size_bytes,
                    mod.num_regs)

        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        func.prepare(
                (2+len(self.fluxes))*"P",
                block=(fplan.threads_per_face(),
                    fplan.parallel_faces, 1),
                texrefs=expr_to_texture_map.values()
                + [index_list_texref])

        return func, expr_to_texture_map
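
    # prepare() fixes the argument layout once and for all: one "P"
    # (pointer) slot for debugbuf, one for the face data block, and one per
    # flux output array, matching the apply_flux declaration above.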


    @memoize_method
    def flux_face_data_block(self, elgroup):
        discr = self.discr
        given = self.plan.given

        fp_struct = face_pair_struct(given.float_type, discr.dimensions)
        fh_struct = flux_header_struct(given.float_type, discr.dimensions)

        INVALID_DEST = (1 << 16) - 1

        headers = []
        fp_blocks = []

        from hedge.backends.cuda import GPUBoundaryFaceStorage

        def find_elface_dest(el_face):
            # offset of this element face's dof strip within the block's
            # face-dof array (face_dofs/ldis are set per block below)
            elface_dofs = face_dofs*ldis.face_count()
            num_in_block = discr.find_number_in_block(el_face[0])
            mb_index, index_in_mb = divmod(
                    num_in_block, given.microblock.elements)
            return (mb_index * given.aligned_face_dofs_per_microblock()
                    + index_in_mb * elface_dofs
                    + el_face[1]*face_dofs)

        int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0

        for block in discr.blocks:
            ldis = block.local_discretization
            face_dofs = ldis.face_node_count()

            faces_todo = set((el, face_nbr)
                    for mb in block.microblocks
                    for el in mb
                    for face_nbr in range(ldis.face_count()))
            same_fp_structs = []
            diff_fp_structs = []
            bdry_fp_structs = []

            while faces_todo:
                elface = faces_todo.pop()

                a_face = discr.face_storage_map[elface]
                b_face = a_face.opposite

                if isinstance(b_face, GPUBoundaryFaceStorage):
                    # boundary face pair
                    b_base = b_face.gpu_bdry_index_in_floats
                    boundary_bitmap = self.executor.elface_to_bdry_bitmap.get(
                            a_face.el_face, 0)
                    b_write_index_list = 0  # doesn't matter
                    b_dest = INVALID_DEST

                    fp_structs = bdry_fp_structs
                    bdry_fp_count += 1
                else:
                    # interior face pair
                    b_base = discr.find_el_gpu_index(b_face.el_face[0])
                    boundary_bitmap = 0

                    if b_face.native_block == a_face.native_block:
                        # same block: two-sided storage
                        faces_todo.remove(b_face.el_face)
                        b_write_index_list = a_face.opp_write_index_list_id
                        b_dest = find_elface_dest(b_face.el_face)

                        fp_structs = same_fp_structs
                        int_fp_count += 1
                    else:
                        # different block: one-sided storage
                        b_write_index_list = 0  # doesn't matter
                        b_dest = INVALID_DEST

                        fp_structs = diff_fp_structs
                        ext_fp_count += 1

                fp_structs.append(
                        fp_struct.make(
                            h=a_face.face_pair_side.h,
                            order=a_face.face_pair_side.order,
                            face_jacobian=a_face.face_pair_side.face_jacobian,
                            normal=a_face.face_pair_side.normal,

                            a_base=discr.find_el_gpu_index(a_face.el_face[0]),
                            b_base=b_base,

                            a_ilist_index=a_face.global_int_flux_index_list_id*face_dofs,
                            b_ilist_index=a_face.global_ext_flux_index_list_id*face_dofs,

                            boundary_bitmap=boundary_bitmap,
                            b_write_ilist_index=b_write_index_list*face_dofs,

                            a_dest=find_elface_dest(a_face.el_face),
                            b_dest=b_dest))

            headers.append(fh_struct.make(
                same_facepairs_end=len(same_fp_structs),
                diff_facepairs_end=len(same_fp_structs)+len(diff_fp_structs),
                bdry_facepairs_end=len(same_fp_structs)+len(diff_fp_structs)
                    +len(bdry_fp_structs)))
            fp_blocks.append(
                    same_fp_structs
                    + diff_fp_structs
                    + bdry_fp_structs)

        from codepy.cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header"))],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))],
                extra_fields={
                    "int_fp_count": int_fp_count,
                    "ext_fp_count": ext_fp_count,
                    "bdry_fp_count": bdry_fp_count,
                    "fp_count": int_fp_count+ext_fp_count+bdry_fp_count,
                    })
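
    # Each block's face pairs are stored as one header plus the
    # concatenation of the three categories; the header records the
    # cumulative end index of each category, which is exactly what the
    # kernel's three while loops test fpair_nr against.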

    @memoize_method
    def fake_flux_face_data_block(self, block_count):
        discr = self.discr
        given = self.plan.given

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        min_headers = []
        min_fp_blocks = []

        from random import randrange, choice

        face_dofs = given.dofs_per_face()

        mp_count = discr.device.get_attribute(
                cuda.device_attribute.MULTIPROCESSOR_COUNT)

        for block_nr in range(mp_count):
            fp_structs = []

            faces = [(mb_nr, mb_el_nr, face_nr)
                    for mb_nr in range(self.plan.microblocks_per_block())
                    for mb_el_nr in range(given.microblock.elements)
                    for face_nr in range(given.faces_per_el())]

            def draw_base():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (block_nr * given.microblock.aligned_floats
                        * self.plan.microblocks_per_block()
                        + mb_nr * given.microblock.aligned_floats
                        + mb_el_nr * given.dofs_per_el())

            def draw_dest():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (mb_nr * given.aligned_face_dofs_per_microblock()
                        + mb_el_nr * face_dofs * given.faces_per_el()
                        + face_nr * face_dofs)

            def bound_int(low, x, hi):
                return int(min(max(low, x), hi))

            from random import gauss
            pdata = self.plan.partition_data
            fp_count = bound_int(
                    0,
                    gauss(
                        pdata.face_pair_avg,
                        (pdata.max_face_pair_count-pdata.face_pair_avg)/2),
                    pdata.max_face_pair_count)

            for i in range(fp_count):
                fp_structs.append(
                        fp_struct.make(
                            h=0.5, order=2, face_jacobian=0.5,
                            normal=discr.dimensions*[0.1],

                            a_base=draw_base(), b_base=draw_base(),

                            a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,
                            b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            boundary_bitmap=1,
                            b_write_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            a_dest=draw_dest(), b_dest=draw_dest()))

            total_ext_face_count = bound_int(0,
                    pdata.ext_face_avg + randrange(-1, 2),
                    fp_count)

            bdry_count = min(total_ext_face_count,
                    randrange(1+int(round(total_ext_face_count/6))))
            diff_count = total_ext_face_count-bdry_count

            min_headers.append(fh_struct.make(
                same_facepairs_end=len(fp_structs)-total_ext_face_count,
                diff_facepairs_end=diff_count,
                bdry_facepairs_end=bdry_count))
            min_fp_blocks.append(fp_structs)

        dups = block_count//mp_count + 1
        headers = (min_headers * dups)[:block_count]
        fp_blocks = (min_fp_blocks * dups)[:block_count]

        from codepy.cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header"))],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))])
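
    # One synthetic block is drawn per multiprocessor and then duplicated up
    # to the requested block count, so the benchmark kernel sees a workload
    # whose face-pair statistics mimic the real partition data.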

    FAKE_INDEX_LIST_COUNT = 30

    @memoize_method
    def index_list_data(self):
        return self.index_list_backend(self.discr.index_lists)

    @memoize_method
    def fake_index_list_data(self):
        ilists = []
        from random import shuffle
        for i in range(self.FAKE_INDEX_LIST_COUNT):
            ilist = range(self.plan.given.dofs_per_face())
            shuffle(ilist)
            ilists.append(ilist)

        return self.index_list_backend(ilists)

    def index_list_backend(self, ilists):
        from pytools import single_valued
        ilist_length = single_valued(len(il) for il in ilists)
        assert ilist_length == self.plan.given.dofs_per_face()

        from codepy.cgen import Typedef, POD

        from pytools import flatten
        flat_ilists_uncast = numpy.array(list(flatten(ilists)))

        if numpy.max(flat_ilists_uncast) >= 256:
            tp = numpy.uint16
        else:
            tp = numpy.uint8

        flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
        assert (flat_ilists == flat_ilists_uncast).all()
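
        # Index lists are stored in the narrowest integer type that holds
        # every entry (uint8 unless some index reaches 256), halving texture
        # bandwidth for small elements; the assert above guards the downcast.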

        return GPUIndexLists(
                type=tp,
                code=[Typedef(POD(tp, "index_list_entry_t"))],
                device_memory=cuda.to_device(flat_ilists),
                bytes=flat_ilists.size*flat_ilists.itemsize,
                )