
Source Code for Module hedge.backends.cuda.fluxgather

   1  """Interface with Nvidia CUDA.""" 
   2   
   3  from __future__ import division 
   4   
   5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
   6   
   7  __license__ = """ 
   8  This program is free software: you can redistribute it and/or modify 
   9  it under the terms of the GNU General Public License as published by 
  10  the Free Software Foundation, either version 3 of the License, or 
  11  (at your option) any later version. 
  12   
  13  This program is distributed in the hope that it will be useful, 
  14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  16  GNU General Public License for more details. 
  17   
  18  You should have received a copy of the GNU General Public License 
  19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
  20  """ 
  21   
  22   
  23   
  24   
  25  import numpy 
  26  from pytools import memoize_method, memoize, Record 
  27  import pycuda.driver as cuda 
  28  import pycuda.gpuarray as gpuarray 
  29  from pycuda.compiler import SourceModule 
  30  import hedge.backends.cuda.plan 
  31  from pymbolic.mapper.c_code import CCodeMapper 
  32  from hedge.flux import FluxIdentityMapper 
  33   
  34  from codepy.cgen import \ 
  35          Pointer, POD, Value, ArrayOf, Typedef, \ 
  36          Module, FunctionDeclaration, FunctionBody, Block, \ 
  37          Comment, Line, Include, \ 
  38          MaybeUnused, \ 
  39          Define, \ 
  40          Initializer, If, For, Statement, Assign, While 
  41   
  42  from pymbolic.mapper.stringifier import PREC_NONE 


class GPUIndexLists(Record):
    pass


# structures ------------------------------------------------------------------
@memoize
def face_pair_struct(float_type, dims):
    from codepy.cgen import GenerableStruct, POD, ArrayOf
    return GenerableStruct("face_pair", [
        POD(float_type, "h"),
        POD(float_type, "order"),
        POD(float_type, "face_jacobian"),
        ArrayOf(POD(float_type, "normal"), dims),

        POD(numpy.uint32, "a_base"),
        POD(numpy.uint32, "b_base"),

        POD(numpy.uint16, "a_ilist_index"),
        POD(numpy.uint16, "b_ilist_index"),
        POD(numpy.uint16, "b_write_ilist_index"),
        POD(numpy.uint16, "boundary_bitmap"),
        POD(numpy.uint16, "a_dest"),
        POD(numpy.uint16, "b_dest"),
        ],

        # ensure that adjacent face_pair instances can be accessed
        # without bank conflicts.
        aligned_prime_to=[2],
        )
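
# Illustrative sketch (added; not part of the original module): dump the C
# declaration that codepy generates, here for a hypothetical float32, 3D
# discretization. face_pair_struct is memoized per (dtype, dims) pair, so
# the call below reuses the cached struct object.
def _print_example_face_pair_struct():
    print face_pair_struct(numpy.float32, 3)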


@memoize
def flux_header_struct(float_type, dims):
    from codepy.cgen import GenerableStruct, POD

    return GenerableStruct("flux_header", [
        POD(numpy.uint16, "same_facepairs_end"),
        POD(numpy.uint16, "diff_facepairs_end"),
        POD(numpy.uint16, "bdry_facepairs_end"),
        ], align_bytes=face_pair_struct(float_type, dims).alignment_requirement())
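
# Note (added for clarity): aligning flux_header to face_pair's alignment
# requirement ensures that the face_pair array following a header in a
# gather data block starts on a correctly aligned address.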


# flux to code mapper ---------------------------------------------------------
class FluxConcretizer(FluxIdentityMapper):
    def __init__(self, int_field_expr, ext_field_expr, dep_to_index):
        self.int_field_expr = int_field_expr
        self.ext_field_expr = ext_field_expr
        self.dep_to_index = dep_to_index

    def map_field_component(self, expr):
        if expr.is_local:
            prefix = "a"
            f_expr = self.int_field_expr
        else:
            prefix = "b"
            f_expr = self.ext_field_expr

        from hedge.tools import is_obj_array, is_zero
        from pymbolic import var
        if is_obj_array(f_expr):
            f_expr = f_expr[expr.index]
            if is_zero(f_expr):
                return 0
            return var("val_%s_field%d" % (prefix, self.dep_to_index[f_expr]))
        else:
            assert expr.index == 0, repr(f_expr)
            if is_zero(f_expr):
                return 0
            return var("val_%s_field%d" % (prefix, self.dep_to_index[f_expr]))




class FluxToCodeMapper(CCodeMapper):
    def __init__(self, dtype):
        self.dtype = dtype

        def float_mapper(x):
            if isinstance(x, float):
                return "value_type(%s)" % repr(x)
            else:
                return repr(x)

        CCodeMapper.__init__(self, float_mapper, reverse=False)

    def map_normal(self, expr, enclosing_prec):
        return "fpair->normal[%d]" % expr.axis

    def map_penalty_term(self, expr, enclosing_prec):
        return ("pow(fpair->order*fpair->order/fpair->h, %r)"
                % expr.power)

    def map_function_symbol(self, expr, enclosing_prec):
        from hedge.flux import FluxFunctionSymbol, \
                flux_abs, flux_min, flux_max

        assert isinstance(expr, FluxFunctionSymbol)

        if self.dtype == numpy.float32:
            suffix = "f"
        elif self.dtype == numpy.float64:
            suffix = ""
        else:
            raise RuntimeError("invalid flux dtype: %s" % self.dtype)

        return {
                flux_abs: "fabs",
                flux_max: "fmax",
                flux_min: "fmin",
                }[expr] + suffix
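
    # Example (added for illustration): with dtype numpy.float32, flux_abs
    # maps to the CUDA function "fabsf"; with numpy.float64 it maps to "fabs".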


def flux_to_code(f2cm, is_flipped, int_field_expr, ext_field_expr,
        dep_to_index, flux, prec):
    if is_flipped:
        from hedge.flux import FluxFlipper
        flux = FluxFlipper()(flux)

    return f2cm(FluxConcretizer(int_field_expr, ext_field_expr, dep_to_index)(flux), prec)




# plan ------------------------------------------------------------------------
class ExecutionPlan(hedge.backends.cuda.plan.ExecutionPlan):
    def __init__(self, given,
            parallel_faces, mbs_per_block, flux_count,
            direct_store, partition_data):
        hedge.backends.cuda.plan.ExecutionPlan.__init__(self, given)
        self.parallel_faces = parallel_faces
        self.mbs_per_block = mbs_per_block
        self.flux_count = flux_count
        self.direct_store = direct_store

        self.partition_data = partition_data

    def microblocks_per_block(self):
        return self.mbs_per_block

    def elements_per_block(self):
        return self.microblocks_per_block()*self.given.microblock.elements

    def dofs_per_block(self):
        return self.microblocks_per_block()*self.given.microblock.aligned_floats

    @memoize_method
    def face_pair_count(self):
        return self.partition_data.max_face_pair_count

    @memoize_method
    def shared_mem_use(self):
        from hedge.backends.cuda.fluxgather import face_pair_struct
        d = self.given.ldis.dimensions

        if self.given.dofs_per_face() > 255:
            index_lists_entry_size = 2
        else:
            index_lists_entry_size = 1

        result = (128 # parameters, block header, small extra stuff
                + len(face_pair_struct(self.given.float_type, d))
                * self.face_pair_count())

        if not self.direct_store:
            result += (self.given.aligned_face_dofs_per_microblock()
                    * self.flux_count
                    * self.microblocks_per_block()
                    * self.given.float_size())

        return result
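
    # Worked example (added; actual numbers depend on the discretization):
    # with an S-byte face_pair struct and N face pairs, the base cost is
    # 128 + S*N bytes; without direct_store, the shared-memory staging
    # buffer adds aligned_face_dofs_per_microblock() * flux_count
    # * microblocks_per_block() * float_size() bytes on top of that. Note
    # that index_lists_entry_size is computed above but not currently
    # included in the total.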

    def threads_per_face(self):
        dpf = self.given.dofs_per_face()

        devdata = self.given.devdata
        if dpf % devdata.smem_granularity >= devdata.smem_granularity // 2:
            from hedge.backends.cuda.tools import int_ceiling
            return int_ceiling(dpf, devdata.smem_granularity)
        else:
            return dpf
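
    # Example (added for illustration, assuming smem_granularity == 16):
    # for dofs_per_face() == 10, the remainder 10 >= 8, so the face is
    # padded to int_ceiling(10, 16) == 16 threads; for dofs_per_face() == 6
    # the remainder is below half a granule and 6 threads are used as-is.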

    def threads(self):
        return self.parallel_faces*self.threads_per_face()

    def registers(self):
        return 51

    def __str__(self):
        result = ("%s pfaces=%d mbs_per_block=%d mb_elements=%d" % (
            hedge.backends.cuda.plan.ExecutionPlan.__str__(self),
            self.parallel_faces,
            self.mbs_per_block,
            self.given.microblock.elements,
            ))

        if self.direct_store:
            result += " direct_store"

        return result

    def make_kernel(self, discr, executor, fluxes):
        return Kernel(discr, self, executor, fluxes)




def make_plan(discr, given, tune_for):
    from hedge.backends.cuda.execute import Executor
    if tune_for is not None:
        fbatch1 = Executor.get_first_flux_batch(discr.mesh, tune_for)
        if fbatch1 is not None:
            fluxes = list(fbatch1.flux_exprs)
            flux_count = len(fluxes)
        else:
            fluxes = None
    else:
        fluxes = None

    if fluxes is None:
        # a reasonable guess?
        flux_count = discr.dimensions

    def generate_valid_plans():
        valid_plan_count = 0
        for direct_store in [False, True]:
            for parallel_faces in range(1, 32):
                for mbs_per_block in range(1, 8):
                    flux_plan = ExecutionPlan(given, parallel_faces,
                            mbs_per_block, flux_count,
                            direct_store=direct_store,
                            partition_data=discr._get_partition_data(
                                mbs_per_block*given.microblock.elements))
                    if flux_plan.invalid_reason() is None:
                        valid_plan_count += 1
                        yield flux_plan

            # if there are valid plans *without* direct_store *and* we're using
            # single precision, then bail now: It's unlikely that direct-store
            # offers any benefit.
            if valid_plan_count and given.float_type == numpy.float32:
                return

    def target_func(plan):
        if tune_for is None:
            return 0
        else:
            return plan.make_kernel(discr, executor=None,
                    fluxes=fluxes).benchmark()

    from hedge.backends.cuda.plan import optimize_plan

    return optimize_plan(
            "gather",
            generate_valid_plans, target_func,
            maximize=False,
            debug_flags=discr.debug)
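
# Note (added for clarity): generate_valid_plans enumerates at most
# 2 * 31 * 7 = 434 candidate plans; optimize_plan benchmarks the valid ones
# through target_func and, since maximize=False, keeps the plan with the
# smallest measured run time. When tune_for is None every target value is 0
# and the choice among valid plans is effectively arbitrary.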


# flux gather kernel ----------------------------------------------------------
class Kernel:
    def __init__(self, discr, plan, executor, fluxes):
        self.discr = discr
        self.plan = plan
        self.executor = executor

        assert isinstance(fluxes, list)
        self.fluxes = fluxes

        interior_deps_set = set()
        boundary_int_deps_set = set()
        boundary_ext_deps_set = set()
        self.dep_to_tag = {}
        for f in fluxes:
            interior_deps_set.update(set(f.interior_deps))
            boundary_int_deps_set.update(set(f.boundary_int_deps))
            boundary_ext_deps_set.update(set(f.boundary_ext_deps))
            self.dep_to_tag.update(f.dep_to_tag)

        self.interior_deps = list(interior_deps_set)
        self.boundary_int_deps = list(boundary_int_deps_set)
        self.boundary_ext_deps = list(boundary_ext_deps_set)
        self.all_deps = list(
                interior_deps_set
                | boundary_int_deps_set
                | boundary_ext_deps_set)

        self.dep_to_index = dict((dep, i) for i, dep in enumerate(self.all_deps))

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
                len(discr.mesh.elements)/self.plan.elements_per_block())
        all_fluxes_on_faces = [gpuarray.empty(
            (block_count * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))]

        field = gpuarray.empty(
                (self.plan.dofs_per_block() * block_count,),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=True)

        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            try:
                gather.prepared_call(
                        (block_count, 1),
                        0,
                        fdata.device_memory,
                        *tuple(fof.gpudata for fof in all_fluxes_on_faces))
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return 1e-3/count * stop.time_since(start)
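
    # Note (added for clarity): Event.time_since() reports milliseconds, so
    # the value returned above is the average wall-clock time per gather
    # launch in seconds; None signals that this plan's launch configuration
    # failed to run on the device.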

    def __call__(self, eval_dependency, lift_plan):
        discr = self.discr
        given = self.plan.given
        elgroup, = discr.element_groups

        all_fluxes_on_faces = [gpuarray.empty(
            given.matmul_preimage_shape(lift_plan),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=False)

        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.flux_gather_timer.add_timer_callable(gather.prepared_timed_call(
                    (len(discr.blocks), 1),
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

            discr.gmem_bytes_gather.add(
                    len(discr.blocks) * fdata.block_bytes
                    +
                    given.float_size()
                    * (
                        # fetch
                        len(self.fluxes)
                        * 2*fdata.fp_count
                        * given.dofs_per_face()

                        # store
                        + len(discr.blocks)
                        * len(self.fluxes)
                        * self.plan.microblocks_per_block()
                        * given.aligned_face_dofs_per_microblock()
                        ))
        else:
            gather.prepared_call(
                    (len(discr.blocks), 1),
                    debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces))

        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                #print copied_debugbuf[:50]

                for i in range(len(discr.blocks)*6):
                    print i, copied_debugbuf[i*16:(i+1)*16]
                    #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6)
                if True:
                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()

                        def sstruc(a):
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i+16]))
                            i += 16

                        cols.append(my_col)

                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"]+range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i,)+row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces

    def gen_store(self, flux_nr, index, what):
        if self.plan.direct_store:
            return Assign(
                    "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE + %s]" % (flux_nr, index),
                    what)
        else:
            return Assign(
                    "smem_fluxes_on_faces[%d][%s]" % (flux_nr, index),
                    what)
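
    # For illustration (added; schematic of the emitted C): with direct_store
    # the store for flux 0 looks like
    #     gmem_fluxes_on_faces0[FOF_BLOCK_BASE + <index>] = <what>;
    # and without it, fluxes are staged in shared memory first:
    #     smem_fluxes_on_faces[0][<index>] = <what>;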

    def write_interior_flux_code(self, is_twosided):
        given = self.plan.given

        def get_field(flux_rec, is_interior, flipped):
            if is_interior ^ flipped:
                prefix = "a"
            else:
                prefix = "b"

            return ("val_%s_field%d" % (prefix, self.dep_to_index[flux_rec.field_expr]))

        flux_write_code = Block([])

        flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)]

        if is_twosided:
            flux_var_decl.append(Initializer(POD(given.float_type, "b_flux"), 0))
            prefixes = ["a", "b"]
            flip_values = [False, True]
        else:
            prefixes = ["a"]
            flip_values = [False]

        flux_write_code.append(Line())

        for dep in self.interior_deps:
            for side in ["a", "b"]:
                flux_write_code.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_%s_field%d"
                                % (side, self.dep_to_index[dep]))),
                            "fp_tex1Dfetch(field%d_tex, %s_index)"
                            % (self.dep_to_index[dep], side)))

        f2cm = FluxToCodeMapper(given.float_type)

        flux_sub_codes = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            my_flux_block = Block(flux_var_decl)

            for int_rec in wdflux.interiors:
                for prefix, is_flipped in zip(prefixes, flip_values):
                    my_flux_block.append(
                            Statement("%s_flux += %s"
                                % (prefix,
                                    flux_to_code(f2cm, is_flipped,
                                        int_rec.field_expr,
                                        int_rec.field_expr,
                                        self.dep_to_index,
                                        int_rec.flux_expr, PREC_NONE),
                                    )))

            my_flux_block.append(Line())

            my_flux_block.append(
                    self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                        "fpair->face_jacobian*a_flux"))

            #my_flux_block.append(
                    #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
                    #)

            if is_twosided:
                my_flux_block.append(
                        self.gen_store(flux_nr,
                            "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                            "fpair->b_write_ilist_index + FACEDOF_NR)",
                            "fpair->face_jacobian*b_flux"))

                #my_flux_block.append(
                        #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"),
                        #)

            flux_sub_codes.append(my_flux_block)

        flux_write_code.extend(
                Initializer(
                    Value("value_type", f2cm.cse_prefix+str(i)), cse)
                for i, cse in enumerate(f2cm.cses))
        flux_write_code.extend(flux_sub_codes)

        return flux_write_code
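
    # For illustration (added; schematic, actual names depend on dep_to_index):
    # for each interior flux term this emits an accumulation statement such as
    #     a_flux += fpair->normal[0]*val_a_field0 - ...;
    # followed by a store of fpair->face_jacobian*a_flux to the "a" side and,
    # for dual-sided face pairs, the flipped flux accumulated into b_flux.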
    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                            bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(
                MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                0)
            for flux_nr in range(len(self.fluxes))])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_a_field%d"
                                % self.dep_to_index[dep])),
                            "fp_tex1Dfetch(field%d_tex, a_index)" % self.dep_to_index[dep]))
            for dep in ext_deps:
                bblock.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_b_field%d"
                                % self.dep_to_index[dep])),
                            "fp_tex1Dfetch(field%d_tex, b_index)" % self.dep_to_index[dep]))

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                        Statement(("flux%d += " % flux_nr) +
                            flux_to_code(f2cm, is_flipped=False,
                                int_field_expr=flux_rec.bpair.field,
                                ext_field_expr=flux_rec.bpair.bfield,
                                dep_to_index=self.dep_to_index,
                                flux=flux_rec.flux_expr, prec=PREC_NONE)))

            bblock.extend(
                    Initializer(
                        Value("value_type", f2cm.cse_prefix+str(i)), cse)
                    for i, cse in enumerate(f2cm.cses))

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                    Block(bblock+comp_code)),
                ])

        flux_write_code.extend([Line()]
                +[
                    self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                        "fpair->face_jacobian * flux%d" % flux_nr)
                    for flux_nr in range(len(self.fluxes))]
                #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
                )

        return flux_write_code

    @memoize_method
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from codepy.cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Comment(str(dep_expr)),
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("DOFS_PER_FACE", given.dofs_per_face()),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB", given.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")
                    ),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])

        f_body.extend([
            Line(),
            S("__syncthreads()"),
            Line()
            ])

        if not fplan.direct_store:
            f_body.extend_log_block("store fluxes", [
                #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                #Assign("debugbuf[0]", "sizeof(face_pair)"),
                For("unsigned word_nr = THREAD_NUM",
                    "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                    "word_nr += COALESCING_THREAD_COUNT",
                    Block([Assign(
                        "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                        "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                        for flux_nr in range(len(self.fluxes))]
                        #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                            #Block([
                                #Assign("debugbuf[blockIdx.x]", "word_nr"),
                                #])
                            #)
                        #for flux_nr in range(len(self.fluxes))]
                        ))
                ])

        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"),
                ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("flux_gather.cu", "w").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
                #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
                cmod,
                keep="cuda_keep_kernels" in discr.debug,
                options=["--maxrregcount=%d" % self.plan.max_registers()])
        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                    mod.local_size_bytes,
                    mod.shared_size_bytes,
                    mod.num_regs)

        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        func.prepare(
                (2+len(self.fluxes))*"P",
                block=(fplan.threads_per_face(),
                    fplan.parallel_faces, 1),
                texrefs=expr_to_texture_map.values()
                + [index_list_texref])

        return func, expr_to_texture_map

    # data blocks -------------------------------------------------------------
    @memoize_method
    def flux_face_data_block(self, elgroup):
        discr = self.discr
        given = self.plan.given
        fplan = discr.flux_plan
        headers = []
        fp_blocks = []

        INVALID_DEST = (1<<16)-1

        from hedge.backends.cuda import GPUBoundaryFaceStorage

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        def find_elface_dest(el_face):
            elface_dofs = face_dofs*ldis.face_count()
            num_in_block = discr.find_number_in_block(el_face[0])
            mb_index, index_in_mb = divmod(num_in_block, given.microblock.elements)
            return (mb_index * given.aligned_face_dofs_per_microblock()
                    + index_in_mb * elface_dofs
                    + el_face[1]*face_dofs)

        int_fp_count, ext_fp_count, bdry_fp_count = 0, 0, 0

        for block in discr.blocks:
            ldis = block.local_discretization
            face_dofs = ldis.face_node_count()

            faces_todo = set((el, face_nbr)
                    for mb in block.microblocks
                    for el in mb
                    for face_nbr in range(ldis.face_count()))
            same_fp_structs = []
            diff_fp_structs = []
            bdry_fp_structs = []

            while faces_todo:
                elface = faces_todo.pop()

                a_face = discr.face_storage_map[elface]
                b_face = a_face.opposite

                if isinstance(b_face, GPUBoundaryFaceStorage):
                    # boundary face
                    b_base = b_face.gpu_bdry_index_in_floats
                    boundary_bitmap = self.executor.elface_to_bdry_bitmap.get(
                            a_face.el_face, 0)
                    b_write_index_list = 0  # doesn't matter
                    b_dest = INVALID_DEST

                    fp_structs = bdry_fp_structs
                    bdry_fp_count += 1
                else:
                    # interior face
                    b_base = discr.find_el_gpu_index(b_face.el_face[0])
                    boundary_bitmap = 0

                    if b_face.native_block == a_face.native_block:
                        # same block
                        faces_todo.remove(b_face.el_face)
                        b_write_index_list = a_face.opp_write_index_list_id
                        b_dest = find_elface_dest(b_face.el_face)

                        fp_structs = same_fp_structs
                        int_fp_count += 1
                    else:
                        # different block
                        b_write_index_list = 0  # doesn't matter
                        b_dest = INVALID_DEST

                        fp_structs = diff_fp_structs
                        ext_fp_count += 1

                fp_structs.append(
                        fp_struct.make(
                            h=a_face.face_pair_side.h,
                            order=a_face.face_pair_side.order,
                            face_jacobian=a_face.face_pair_side.face_jacobian,
                            normal=a_face.face_pair_side.normal,

                            a_base=discr.find_el_gpu_index(a_face.el_face[0]),
                            b_base=b_base,

                            a_ilist_index=
                                a_face.global_int_flux_index_list_id*face_dofs,
                            b_ilist_index=
                                a_face.global_ext_flux_index_list_id*face_dofs,

                            boundary_bitmap=boundary_bitmap,
                            b_write_ilist_index=
                                b_write_index_list*face_dofs,

                            a_dest=find_elface_dest(a_face.el_face),
                            b_dest=b_dest))

            headers.append(fh_struct.make(
                    same_facepairs_end=len(same_fp_structs),
                    diff_facepairs_end=len(same_fp_structs)+len(diff_fp_structs),
                    bdry_facepairs_end=len(same_fp_structs)+len(diff_fp_structs)
                        +len(bdry_fp_structs)))
            fp_blocks.append(
                    same_fp_structs
                    +diff_fp_structs
                    +bdry_fp_structs)

        #print len(same_fp_structs), len(diff_fp_structs), len(bdry_fp_structs)

        from codepy.cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header"))],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))],
                extra_fields={
                    "int_fp_count": int_fp_count,
                    "ext_fp_count": ext_fp_count,
                    "bdry_fp_count": bdry_fp_count,
                    "fp_count": int_fp_count+ext_fp_count+bdry_fp_count,
                    })

    @memoize_method
    def fake_flux_face_data_block(self, block_count):
        discr = self.discr
        given = self.plan.given

        fh_struct = flux_header_struct(given.float_type, discr.dimensions)
        fp_struct = face_pair_struct(given.float_type, discr.dimensions)

        min_headers = []
        min_fp_blocks = []

        from random import randrange, choice

        face_dofs = given.dofs_per_face()

        mp_count = discr.device.get_attribute(
                cuda.device_attribute.MULTIPROCESSOR_COUNT)

        for block_nr in range(mp_count):
            fp_structs = []

            faces = [(mb_nr, mb_el_nr, face_nr)
                    for mb_nr in range(self.plan.microblocks_per_block())
                    for mb_el_nr in range(given.microblock.elements)
                    for face_nr in range(given.faces_per_el())]

            def draw_base():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (block_nr * given.microblock.aligned_floats
                        * self.plan.microblocks_per_block()
                        + mb_nr * given.microblock.aligned_floats
                        + mb_el_nr * given.dofs_per_el())

            def draw_dest():
                mb_nr, mb_el_nr, face_nr = choice(faces)
                return (mb_nr * given.aligned_face_dofs_per_microblock()
                        + mb_el_nr * face_dofs * given.faces_per_el()
                        + face_nr * face_dofs)

            def bound_int(low, x, hi):
                return int(min(max(low, x), hi))

            from random import gauss
            pdata = self.plan.partition_data
            fp_count = bound_int(
                    0,
                    gauss(
                        pdata.face_pair_avg,
                        (pdata.max_face_pair_count-pdata.face_pair_avg)/2),
                    pdata.max_face_pair_count)

            for i in range(fp_count):
                fp_structs.append(
                        fp_struct.make(
                            h=0.5, order=2, face_jacobian=0.5,
                            normal=discr.dimensions*[0.1],

                            a_base=draw_base(), b_base=draw_base(),

                            a_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,
                            b_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            boundary_bitmap=1,
                            b_write_ilist_index=randrange(self.FAKE_INDEX_LIST_COUNT)*face_dofs,

                            a_dest=draw_dest(), b_dest=draw_dest()))

            total_ext_face_count = bound_int(0,
                    pdata.ext_face_avg + randrange(-1, 2),
                    fp_count)

            bdry_count = min(total_ext_face_count,
                    randrange(1+int(round(total_ext_face_count/6))))
            diff_count = total_ext_face_count-bdry_count

            min_headers.append(fh_struct.make(
                    same_facepairs_end=len(fp_structs)-total_ext_face_count,
                    diff_facepairs_end=diff_count,
                    bdry_facepairs_end=bdry_count))
            min_fp_blocks.append(fp_structs)

        dups = block_count//mp_count + 1
        headers = (min_headers * dups)[:block_count]
        fp_blocks = (min_fp_blocks * dups)[:block_count]

        from codepy.cgen import Value
        from hedge.backends.cuda.tools import make_superblocks

        return make_superblocks(
                given.devdata, "flux_data",
                [(headers, Value(fh_struct.tpname, "header"))],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs"))])

    # index lists -------------------------------------------------------------
    FAKE_INDEX_LIST_COUNT = 30

    @memoize_method
    def index_list_data(self):
        return self.index_list_backend(self.discr.index_lists)

    @memoize_method
    def fake_index_list_data(self):
        ilists = []
        from random import shuffle
        for i in range(self.FAKE_INDEX_LIST_COUNT):
            ilist = range(self.plan.given.dofs_per_face())
            shuffle(ilist)
            ilists.append(ilist)

        return self.index_list_backend(ilists)
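
    # Example (added for illustration): with dofs_per_face() == 3, this
    # yields 30 random permutations of [0, 1, 2], e.g. [2, 0, 1], which
    # stand in for the real face index lists during benchmarking.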

    def index_list_backend(self, ilists):
        from pytools import single_valued
        ilist_length = single_valued(len(il) for il in ilists)
        assert ilist_length == self.plan.given.dofs_per_face()

        from codepy.cgen import Typedef, POD

        from pytools import flatten
        flat_ilists_uncast = numpy.array(list(flatten(ilists)))

        if numpy.max(flat_ilists_uncast) >= 256:
            tp = numpy.uint16
        else:
            tp = numpy.uint8

        flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
        assert (flat_ilists == flat_ilists_uncast).all()

        return GPUIndexLists(
                type=tp,
                code=[Typedef(POD(tp, "index_list_entry_t"))],
                device_memory=cuda.to_device(flat_ilists),
                bytes=flat_ilists.size*flat_ilists.itemsize,
                )
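
    # Illustrative sketch (added; hypothetical helper, not part of the
    # original interface): the dtype choice above can be checked on the host
    # without touching the GPU. Entries of 255 or less fit numpy.uint8;
    # anything larger forces numpy.uint16.
    @staticmethod
    def _example_ilist_dtype(ilists):
        flat = numpy.array([entry for il in ilists for entry in il])
        if flat.max() >= 256:
            return numpy.uint16
        else:
            return numpy.uint8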