
Source Code for Package hedge.backends.cuda

from __future__ import division

__copyright__ = "Copyright (C) 2008 Andreas Kloeckner"

__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see U{http://www.gnu.org/licenses/}.
"""



import numpy
import hedge.discretization
# imported for attribute access further below
# (hedge.mesh.TAG_REALLY_ALL, hedge._internal.INVALID_ELEMENT)
import hedge.mesh
import hedge._internal
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pytools import memoize_method, Record
from hedge.tools import Future



# gpu data organization -------------------------------------------------------
class GPUBlock(object):
    """Describes what data is local to each thread block on the GPU.

    @ivar number: The global number of this block.
    @ivar local_discretization: The L{hedge.element.Element} instance used
      for elements in this block.
    @ivar cpu_slices: A list of slices describing the CPU-side
      storage locations for the block's elements.
    @ivar microblocks: A list of lists of L{hedge.mesh.Element} instances,
      each representing the elements in one microblock, and together
      representing one block.
    @ivar el_offsets_list: A list containing the offsets of the elements in
      this block, in order.
    @ivar el_number_map: A dictionary mapping L{hedge.mesh.Element} instances
      to their number within this block.
    """
    __slots__ = ["number", "local_discretization", "cpu_slices", "microblocks",
            "el_offsets_list", "el_number_map",
            ]

    def __init__(self, number, local_discretization, cpu_slices, microblocks,
            el_offsets_list, el_number_map):
        self.number = number
        self.local_discretization = local_discretization
        self.cpu_slices = cpu_slices
        self.microblocks = microblocks
        self.el_offsets_list = el_offsets_list
        self.el_number_map = el_number_map




class GPUFaceStorage(object):
    """Describes where the dofs of an element face are stored.

    @ivar opposite: the L{GPUFaceStorage} instance for the face
      opposite to this one.
    """
    __slots__ = ["opposite"]

    def __init__(self):
        self.opposite = None

    def set_opposite(self, opp):
        if self.opposite is None:
            self.opposite = opp
        else:
            assert self.opposite is opp

class GPUInteriorFaceStorage(GPUFaceStorage):
    """Describes storage locations for a face local to an element in a block.

    @ivar el_face: a tuple C{(element, face_number)}.
    @ivar cpu_slice: the base index of the element in CPU numbering.
    @ivar native_index_list_id:
    @ivar opp_write_index_list_id:
    @ivar native_block: block in which element is to be found.
    @ivar face_pair_side:
    """
    __slots__ = [
            "el_face", "cpu_slice",
            "native_index_list_id", "opp_write_index_list_id",
            "global_int_flux_index_list_id", "global_ext_flux_index_list_id",
            "native_block",
            "face_pair_side"]

    def __init__(self, el_face, cpu_slice, native_index_list_id,
            native_block, face_pair_side):
        GPUFaceStorage.__init__(self)
        self.el_face = el_face
        self.cpu_slice = cpu_slice
        self.native_index_list_id = native_index_list_id
        self.native_block = native_block
        self.face_pair_side = face_pair_side

class GPUBoundaryFaceStorage(GPUFaceStorage):
    """Describes storage locations for a boundary face.

    @ivar cpu_bdry_index_in_floats: this face's starting index
      in the CPU-based TAG_ALL boundary array [floats].
    @ivar gpu_bdry_index_in_floats: this face's starting index
      in the GPU-based TAG_ALL boundary array [floats].
    @ivar face_pair_side:
    """
    __slots__ = [
            "cpu_bdry_index_in_floats",
            "gpu_bdry_index_in_floats",
            "face_pair_side",
            ]

    def __init__(self,
            cpu_bdry_index_in_floats,
            gpu_bdry_index_in_floats,
            face_pair_side
            ):
        GPUFaceStorage.__init__(self)
        self.cpu_bdry_index_in_floats = cpu_bdry_index_in_floats
        self.gpu_bdry_index_in_floats = gpu_bdry_index_in_floats
        self.face_pair_side = face_pair_side

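# Usage sketch (illustrative only, not part of hedge): once the
# discretization's _build_face_storage_map() (defined further below) has run,
# face_storage_map links each interior face to its neighbor (or to its
# boundary storage) through the `opposite` attribute, so the GPU start index
# of a boundary face can be looked up from the volume side:
#
#   vol_face = discr.face_storage_map[(element, face_number)]
#   bdry_face = vol_face.opposite
#   if isinstance(bdry_face, GPUBoundaryFaceStorage):
#       start = bdry_face.gpu_bdry_index_in_floats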



# GPU mesh partition ----------------------------------------------------------
def make_gpu_partition_greedy(adjgraph, max_block_size):

    def first(iterable):
        it = iter(iterable)
        try:
            return it.next()
        except StopIteration:
            return None

    def bfs(top_node):
        queue = [top_node]

        result = set()

        def num_connections_to_result(node):
            return sum(1 for rn in result if node in adjgraph[rn])

        from pytools import argmax2

        while True:
            curr_node_idx = argmax2((i, num_connections_to_result(qn))
                    for i, qn in enumerate(queue))

            curr_node = queue.pop(curr_node_idx)

            if curr_node in avail_nodes:
                avail_nodes.remove(curr_node)
                result.add(curr_node)
                if len(result) == max_block_size:
                    return result, first(
                            node for node in queue if node in avail_nodes)

                queue.extend(adjgraph[curr_node])

            if not queue:
                # ran out of nodes in immediate vicinity -- add new ones from elsewhere
                if avail_nodes:
                    queue.append(iter(avail_nodes).next())
                else:
                    return result, None

    avail_nodes = set(adjgraph.iterkeys())
    next_node = None

    partition = [0]*len(adjgraph)

    blocks = []
    while avail_nodes:
        if next_node is None:
            from pytools import argmax2
            next_node = argmax2((node, len(adjgraph[node])) for node in avail_nodes)

        block, next_node = list(bfs(next_node))

        for el in block:
            partition[el] = len(blocks)
        blocks.append(block)

    return partition, blocks

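# Example (illustrative sketch): the greedy partitioner only needs a plain
# adjacency structure mapping element ids to iterables of neighbor ids, so it
# can be exercised on a toy graph; the graph below is made up for
# illustration and is not part of hedge:
#
#   adjgraph = {0: [1], 1: [0, 2], 2: [1, 3], 3: [2]}
#   partition, blocks = make_gpu_partition_greedy(adjgraph, max_block_size=2)
#   # partition[i] is the block number assigned to element i,
#   # blocks[b] lists the element ids placed in block b.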



def make_gpu_partition_metis(adjgraph, max_block_size):
    from pymetis import part_graph

    orig_part_count = part_count = len(adjgraph)//max_block_size+1

    attempt_count = 5
    for attempt in range(attempt_count):
        if part_count > 1:
            cuts, partition = part_graph(part_count,
                    adjgraph, vweights=[1000]*len(adjgraph))
        else:
            # metis bug workaround:
            # metis returns ones (instead of zeros) if part_count == 1
            partition = [0]*len(adjgraph)

        blocks = [[] for i in range(part_count)]
        for el_id, block in enumerate(partition):
            blocks[block].append(el_id)
        block_elements = max(len(block_els) for block_els in blocks)

        if block_elements <= max_block_size:
            return partition, blocks

        part_count += min(5, int(part_count*0.01))

    from warnings import warn

    warn("could not achieve Metis partition after %d attempts, falling back to greedy"
            % attempt_count)

    return make_gpu_partition_greedy(adjgraph, max_block_size)




# GPU discretization ----------------------------------------------------------
class Discretization(hedge.discretization.Discretization):
    from hedge.backends.cuda.execute import ExecutionMapper \
            as exec_mapper_class
    from hedge.backends.cuda.execute import Executor \
            as executor_class

    @classmethod
    def all_debug_flags(cls):
        return hedge.discretization.Discretization.all_debug_flags() | set([
            "cuda_ilist_generation",
            "cuda_compare",
            "cuda_diff",
            "cuda_diff_plan",
            "cuda_flux",
            "cuda_lift",
            "cuda_gather_plan",
            "cuda_lift_plan",
            "cuda_debugbuf",
            "cuda_memory",
            "cuda_dump_kernels",
            "cuda_fastbench",
            "cuda_no_microblock",
            "cuda_no_smem_matrix",
            "cuda_no_plan",
            "cuda_no_plan_el_local",
            "cuda_keep_kernels",
            "cuda_try_no_microblock",
            "cuda_plan_log",
            "cuda_plan_no_progress",
            ])
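
    # Construction sketch (illustrative; assumes an existing hedge mesh
    # object named `mesh`): debug flags drawn from the set above are passed
    # to the constructor defined further below, e.g.
    #
    #   discr = Discretization(mesh, order=4,
    #           debug=set(["cuda_flux", "cuda_plan_log"]),
    #           default_scalar_type=numpy.float32)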

    class PartitionData(Record):
        pass

    def _get_partition_data(self, max_block_size):
        try:
            return self.partition_cache[max_block_size]
        except KeyError:
            pass

        try:
            import pymetis
            metis_available = True
        except ImportError:
            metis_available = False

        if max_block_size >= 10 and metis_available:
            partition, blocks = make_gpu_partition_metis(
                    self.mesh.element_adjacency_graph(),
                    max_block_size)
        else:
            partition, blocks = make_gpu_partition_greedy(
                    self.mesh.element_adjacency_graph(),
                    max_block_size)

        # prepare a mapping: block# -> # of external interfaces
        block2extifaces = dict((i, 0) for i in range(len(blocks)))

        for (e1, f1), (e2, f2) in self.mesh.both_interfaces():
            b1 = partition[e1.id]
            b2 = partition[e2.id]

            if b1 != b2:
                block2extifaces[b1] += 1

        for el, face_nbr in self.mesh.tag_to_boundary[hedge.mesh.TAG_REALLY_ALL]:
            b1 = partition[el.id]
            block2extifaces[b1] += 1

        eg, = self.element_groups

        max_facepairs = 0
        int_face_pair_count = 0
        face_pair_count = 0

        for b in range(len(blocks)):
            b_ext_faces = block2extifaces[b]
            b_int_faces = (len(blocks[b])*eg.local_discretization.face_count()
                    - b_ext_faces)
            assert b_int_faces % 2 == 0
            b_facepairs = b_int_faces//2 + b_ext_faces

            int_face_pair_count += b_int_faces//2
            max_facepairs = max(max_facepairs, b_facepairs)
            face_pair_count += b_facepairs

        from pytools import average

        result = self.PartitionData(
                partition=partition,
                blocks=blocks,
                max_face_pair_count=max_facepairs,
                ext_face_avg=average(
                    block2extifaces.itervalues()),
                int_face_pair_avg=int_face_pair_count/len(blocks),
                face_pair_avg=face_pair_count/len(blocks),
                )

        self.partition_cache[max_block_size] = result
        return result

    def __init__(self, mesh, local_discretization=None,
            order=None, init_cuda=True, debug=set(),
            device=None, default_scalar_type=numpy.float32,
            tune_for=None, run_context=None,
            mpi_cuda_dev_filter=lambda dev: True):
        """
        @arg tune_for: An optemplate for whose application this discretization's
          flux plan will be tuned.
        """

        if tune_for is None:
            from warnings import warn
            warn("You can achieve better performance if you pass an optemplate "
                    "in the tune_for= kwarg.")

        # initialize superclass
        ldis = self.get_local_discretization(mesh, local_discretization, order)

        hedge.discretization.Discretization.__init__(self, mesh, ldis, debug=debug,
                default_scalar_type=default_scalar_type)

        # cuda init
        if init_cuda:
            cuda.init()

        if device is None:
            if run_context is None:
                from pycuda.tools import get_default_device
                device = get_default_device()
            else:
                from hedge.backends.cuda.tools import mpi_get_default_device
                device = mpi_get_default_device(run_context.communicator,
                        dev_filter=mpi_cuda_dev_filter)

        if isinstance(device, int):
            device = cuda.Device(device)
        if init_cuda:
            self.cuda_context = device.make_context()
        else:
            self.cuda_context = None

        self.device = device
        from pycuda.tools import DeviceData

        # initialize memory pool
        if "cuda_memory" in self.debug:
            from pycuda.tools import DebugMemoryPool
            if run_context is not None and run_context.ranks > 1:
                self.pool = DebugMemoryPool(
                        interactive=False,
                        logfile=open("rank-%d-mem.log" % run_context.rank, "w"))
            else:
                self.pool = DebugMemoryPool(
                        interactive=False,
                        logfile=open("mem.log", "w"))
        else:
            from pycuda.tools import DeviceMemoryPool
            self.pool = DeviceMemoryPool()

        from pycuda.tools import PageLockedMemoryPool
        self.pagelocked_pool = PageLockedMemoryPool()

        # generate flux plan
        self.partition_cache = {}

        from pytools import Record
        class OverallPlan(Record):
            pass

        def generate_overall_plans():
            if "cuda_no_microblock" in self.debug:
                allow_mb_values = [False]
            elif "cuda_try_no_microblock" in self.debug:
                allow_mb_values = [True, False]
            else:
                allow_mb_values = [True]

            for allow_mb in allow_mb_values:
                from hedge.backends.cuda.plan import PlanGivenData
                given = PlanGivenData(
                        DeviceData(device), ldis,
                        allow_microblocking=allow_mb,
                        float_type=default_scalar_type)

                import hedge.backends.cuda.fluxgather as fluxgather
                flux_plan, flux_time = fluxgather.make_plan(self, given, tune_for)

                # partition mesh, obtain updated plan
                pdata = self._get_partition_data(
                        flux_plan.elements_per_block())
                given.post_decomposition(
                        block_count=len(pdata.blocks),
                        microblocks_per_block=flux_plan.microblocks_per_block())

                # plan local operations
                from hedge.backends.cuda.plan import \
                        make_diff_plan, \
                        make_element_local_plan

                diff_plan, diff_time = make_diff_plan(self, given)
                fluxlocal_plan, fluxlocal_time = make_element_local_plan(
                        self, given,
                        op_name="lift",
                        aligned_preimage_dofs_per_microblock=
                            given.aligned_face_dofs_per_microblock(),
                        preimage_dofs_per_el=given.face_dofs_per_el(),
                        with_index_check=False)

                sys_size = flux_plan.flux_count
                total_time = flux_time + sys_size*(diff_time+fluxlocal_time)

                yield OverallPlan(
                        given=given,
                        flux_plan=flux_plan,
                        partition=pdata.partition,
                        diff_plan=diff_plan,
                        fluxlocal_plan=fluxlocal_plan), total_time

        overall_plans = list(generate_overall_plans())

        for op, total_time in overall_plans:
            print "microblocking=%s -> time=%g" % (
                    op.given.microblock.elements != 1, total_time)

        from pytools import argmin2
        overall_plan = argmin2(overall_plans)

        self.given = overall_plan.given
        self.flux_plan = overall_plan.flux_plan
        self.partition = overall_plan.partition
        self.diff_plan = overall_plan.diff_plan
        self.fluxlocal_plan = overall_plan.fluxlocal_plan

        print "actual flux exec plan:", self.flux_plan
        print "actual diff op exec plan:", self.diff_plan
        print "actual flux local exec plan:", self.fluxlocal_plan

        # build data structures
        self.blocks = self._build_blocks()
        self.face_storage_map = self._build_face_storage_map()

        # make a CPU reference discretization
        if "cuda_compare" in self.debug:
            from hedge.backends.jit import Discretization
            self.test_discr = Discretization(mesh, ldis)

        self.stream_pool = []

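    # Note on the plan selection above: overall_plans is a list of
    # (OverallPlan, total_time) pairs, and pytools.argmin2 returns the first
    # member of the pair whose second member is smallest, i.e. the plan with
    # the lowest predicted execution time. A minimal illustration (values
    # made up):
    #
    #   from pytools import argmin2
    #   argmin2([("a", 3.0), ("b", 1.5), ("c", 2.0)])  # -> "b"
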
    def close(self):
        self.pool.stop_holding()
        self.pagelocked_pool.stop_holding()
        if self.cuda_context is not None:
            self.cuda_context.pop()

    def _build_blocks(self):
        block_el_numbers = {}
        for el_id, block in enumerate(self.partition):
            block_el_numbers.setdefault(block, []).append(el_id)

        block_count = len(block_el_numbers)

        def make_block(block_num):
            given = self.given

            microblocks = []
            current_microblock = []
            el_offsets_list = []
            el_number_map = {}
            elements = [self.mesh.elements[ben]
                    for ben in block_el_numbers.get(block_num, [])]
            for block_el_nr, el in enumerate(elements):
                el_offset = (
                        len(microblocks)*given.microblock.aligned_floats
                        + len(current_microblock)*given.dofs_per_el())
                el_number_map[el] = block_el_nr
                el_offsets_list.append(el_offset)

                current_microblock.append(el)
                if len(current_microblock) == given.microblock.elements:
                    microblocks.append(current_microblock)
                    current_microblock = []

            if current_microblock:
                microblocks.append(current_microblock)

            assert len(microblocks) <= self.flux_plan.microblocks_per_block()

            eg, = self.element_groups
            return GPUBlock(block_num,
                    local_discretization=eg.local_discretization,
                    cpu_slices=[self.find_el_range(el.id) for el in elements],
                    microblocks=microblocks,
                    el_offsets_list=el_offsets_list,
                    el_number_map=el_number_map)

        return [make_block(block_num) for block_num in range(block_count)]

    def _build_face_storage_map(self):
        # Side effects:
        # - fill in GPUBlock.extfaces
        # - set self.aligned_boundary_floats
        fsm = {}

        from hedge.tools import IndexListRegistry
        fil_registry = IndexListRegistry("cuda_ilist_generation" in self.debug)

        def make_int_face(face_pair_side):
            el = self.mesh.elements[face_pair_side.element_id]
            elface = (el, face_pair_side.face_id)

            block = self.blocks[self.partition[el.id]]
            iln = fil_registry.register(
                    (ldis, face_pair_side.face_id),
                    lambda: ldis.face_indices()[face_pair_side.face_id]
                    )
            result = GPUInteriorFaceStorage(
                    elface,
                    cpu_slice=self.find_el_range(el.id),
                    native_index_list_id=iln,
                    native_block=block,
                    face_pair_side=face_pair_side
                    )

            assert elface not in fsm
            fsm[elface] = result
            return result

        int_fg, = self.face_groups
        ldis = int_fg.ldis_loc
        assert ldis == int_fg.ldis_opp

        id_face_index_list_number = fil_registry.register(
                None,
                lambda: tuple(xrange(ldis.face_node_count()))
                )
        assert id_face_index_list_number == 0

        for fp in int_fg.face_pairs:
            face1 = make_int_face(fp.loc)
            face2 = make_int_face(fp.opp)
            face1.opposite = face2
            face2.opposite = face1

            def apply_write_map(wmap, sequence):
                result = [None] * len(sequence)
                for wm_i, seq_i in zip(wmap, sequence):
                    result[wm_i] = seq_i
                assert None not in result
                return tuple(result)

            f_ind = ldis.face_indices()

            def face1_in_el_ilist():
                return tuple(int_fg.index_lists[
                    fp.loc.face_index_list_number])

            def face2_in_el_ilist():
                return tuple(int_fg.index_lists[
                    fp.opp.face_index_list_number])

            def opp_write_map():
                return tuple(
                        int_fg.index_lists[fp.opp_native_write_map])

            face1.global_int_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.loc.face_index_list_number),
                    face1_in_el_ilist)
            face1.global_ext_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp.face_index_list_number),
                    face2_in_el_ilist)

            face2.global_int_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp_native_write_map,
                        fp.opp.face_index_list_number),
                    lambda: apply_write_map(
                        opp_write_map(), face2_in_el_ilist())
                    )
            face2.global_ext_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp_native_write_map,
                        fp.loc.face_index_list_number),
                    lambda: apply_write_map(
                        opp_write_map(), face1_in_el_ilist())
                    )

            from pytools import get_write_to_map_from_permutation as gwtm
            #assert gwtm(face2_in_el_ilist, f_ind[fp.opp.face_id]) == opp_write_map
            face1.opp_write_index_list_id = fil_registry.register(
                    (int_fg, "wtm", fp.opp.face_index_list_number,
                        fp.opp.face_id),
                    lambda: gwtm(face2_in_el_ilist(), f_ind[fp.opp.face_id])
                    )
            face2.opp_write_index_list_id = fil_registry.register(
                    (int_fg, "wtm",
                        fp.opp_native_write_map,
                        fp.loc.face_index_list_number,
                        fp.loc.face_id),
                    lambda: gwtm(
                        apply_write_map(opp_write_map(), face1_in_el_ilist()),
                        f_ind[fp.loc.face_id])
                    )

        self.aligned_boundary_floats = 0
        from hedge.mesh import TAG_REALLY_ALL

        for bdry_fg in self.get_boundary(TAG_REALLY_ALL).face_groups:
            if bdry_fg.ldis_loc is None:
                assert len(bdry_fg.face_pairs) == 0
                continue

            assert ldis == bdry_fg.ldis_loc

            aligned_fnc = self.given.devdata.align_dtype(ldis.face_node_count(),
                    self.given.float_size())
            for fp in bdry_fg.face_pairs:
                assert fp.opp.element_id == hedge._internal.INVALID_ELEMENT
                #assert (tuple(bdry_fg.index_lists[fp.opp.face_index_list_number])
                #        == id_face_index_list)

                face1 = make_int_face(fp.loc)
                face2 = GPUBoundaryFaceStorage(
                        fp.opp.el_base_index,
                        self.aligned_boundary_floats,
                        fp.opp
                        )
                self.aligned_boundary_floats += aligned_fnc
                face1.opposite = face2
                face2.opposite = face1

                face1.global_int_flux_index_list_id = fil_registry.register(
                        (bdry_fg, fp.loc.face_index_list_number),
                        lambda: tuple(bdry_fg.index_lists[
                            fp.loc.face_index_list_number])
                        )
                face1.global_ext_flux_index_list_id = fil_registry.register(
                        (bdry_fg, fp.opp.face_index_list_number),
                        lambda: tuple(bdry_fg.index_lists[
                            fp.opp.face_index_list_number])
                        )

        self.index_lists = fil_registry.index_lists
        return fsm

    # stream pooling -----------------------------------------------------
    # (stupid CUDA isn't smart enough to allocate streams without
    # synchronizing. sigh.)
    def _get_stream(self):
        if not self.stream_pool:
            return cuda.Stream()
        else:
            return self.stream_pool.pop()

    def _release_stream(self, s):
        self.stream_pool.append(s)
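
    # Usage sketch (illustrative only): kernels in this backend borrow a
    # stream from the pool and hand it back once the associated Future has
    # completed, along the lines of
    #
    #   stream = discr._get_stream()
    #   # ... launch asynchronous work on `stream` ...
    #   stream.synchronize()
    #   discr._release_stream(stream)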

    # instrumentation ---------------------------------------------------------
    def add_instrumentation(self, mgr):
        mgr.set_constant("flux_plan", str(self.flux_plan))
        mgr.set_constant("diff_plan", str(self.diff_plan))
        mgr.set_constant("fluxlocal_plan", str(self.fluxlocal_plan))

        from pytools.log import EventCounter

        self.gmem_bytes_gather = EventCounter("gmem_bytes_gather",
                "Bytes of gmem traffic during gather")
        self.gmem_bytes_el_local = EventCounter("gmem_bytes_el_local",
                "Bytes of gmem traffic during element-local matrix application")
        self.gmem_bytes_diff = EventCounter("gmem_bytes_diff",
                "Bytes of gmem traffic during differentiation")
        self.gmem_bytes_vector_math = EventCounter("gmem_bytes_vector_math",
                "Bytes of gmem traffic during vector math")
        self.gmem_bytes_rk4 = EventCounter("gmem_bytes_rk4",
                "Bytes of gmem traffic during RK4")

        mgr.add_quantity(self.gmem_bytes_gather)
        mgr.add_quantity(self.gmem_bytes_el_local)
        mgr.add_quantity(self.gmem_bytes_diff)
        mgr.add_quantity(self.gmem_bytes_vector_math)
        mgr.add_quantity(self.gmem_bytes_rk4)

        hedge.discretization.Discretization.add_instrumentation(self, mgr)

    def create_op_timers(self):
        from hedge.backends.cuda.tools import CallableCollectionTimer

        self.flux_gather_timer = CallableCollectionTimer("t_gather",
                "Time spent gathering fluxes")
        self.el_local_timer = CallableCollectionTimer("t_el_local",
                "Time spent applying element-local matrices (lift, mass)")
        self.diff_op_timer = CallableCollectionTimer("t_diff",
                "Time spent applying differentiation operators")
        self.vector_math_timer = CallableCollectionTimer("t_vector_math",
                "Time spent doing vector math")

        return [self.flux_gather_timer,
                self.el_local_timer,
                self.diff_op_timer,
                self.vector_math_timer]

    # utilities ---------------------------------------------------------------
    def find_el_gpu_index(self, el):
        given = self.given
        block = self.blocks[self.partition[el.id]]

        mb_nr, in_mb_nr = divmod(block.el_number_map[el], given.microblock.elements)

        return (block.number * self.flux_plan.dofs_per_block()
                + mb_nr*given.microblock.aligned_floats
                + in_mb_nr*given.dofs_per_el())

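    # Worked example for find_el_gpu_index (illustrative numbers, not taken
    # from a real plan): with dofs_per_block() = 2048,
    # microblock.aligned_floats = 128, microblock.elements = 6 and
    # dofs_per_el() = 20, the element that is number 8 within block 3 falls
    # into microblock 1, slot 2 of that microblock, so its dofs start at
    # 3*2048 + 1*128 + 2*20 = 6312 floats into the volume field.
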
    def find_number_in_block(self, el):
        block = self.blocks[self.partition[el.id]]
        return block.el_number_map[el]

    @memoize_method
    def gpu_dof_count(self):
        from hedge.backends.cuda.tools import int_ceiling

        fplan = self.flux_plan
        return int_ceiling(
                int_ceiling(
                    fplan.dofs_per_block() * len(self.blocks),
                    self.diff_plan.dofs_per_macroblock()),
                self.fluxlocal_plan.dofs_per_macroblock())
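
    # Worked example (illustrative numbers): int_ceiling(x, y) rounds x up to
    # the next multiple of y. With dofs_per_block() = 2048, 10 blocks,
    # diff_plan.dofs_per_macroblock() = 768 and
    # fluxlocal_plan.dofs_per_macroblock() = 512, this gives
    # int_ceiling(int_ceiling(20480, 768), 512)
    #   = int_ceiling(20736, 512) = 20992 gpu dofs.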

    @memoize_method
    def _gpu_volume_embedding(self):
        result = numpy.zeros((len(self.nodes),), dtype=numpy.intp)
        block_offset = 0
        block_size = self.flux_plan.dofs_per_block()
        for block in self.blocks:
            el_length = block.local_discretization.node_count()

            for el_offset, cpu_slice in zip(
                    block.el_offsets_list, block.cpu_slices):
                result[cpu_slice] = \
                        block_offset+el_offset+numpy.arange(el_length)

            block_offset += block_size

        assert (result <= self.gpu_dof_count()).all()

        return result

    @memoize_method
    def _meaningful_volume_indices(self):
        return gpuarray.to_gpu(
                numpy.asarray(
                    numpy.sort(self._gpu_volume_embedding()),
                    dtype=numpy.uint32),
                allocator=self.pool.allocate)

    def _volume_to_gpu(self, field):
        def f(subfld):
            cpu_transfer = self.pagelocked_pool.allocate(
                    (self.gpu_dof_count(),), dtype=subfld.dtype)

            cpu_transfer[self._gpu_volume_embedding()] = subfld
            return gpuarray.to_gpu(cpu_transfer, allocator=self.pool.allocate)

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)

    def _volume_from_gpu(self, field):
        def f(subfld):
            return subfld.get(pagelocked=True)[self._gpu_volume_embedding()]

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)

    @memoize_method
    def _gpu_boundary_embedding(self, tag):
        """Return an array of indices embedding a CPU boundary
        field for C{tag} into the GPU boundary field."""

        bdry = self.get_boundary(tag)
        result = numpy.empty(
                (len(bdry.nodes),),
                dtype=numpy.intp)
        result.fill(-1)

        cpu_base = 0
        for elface in self.mesh.tag_to_boundary.get(tag, []):
            face_stor = self.face_storage_map[elface]
            bdry_stor = face_stor.opposite
            assert isinstance(bdry_stor, GPUBoundaryFaceStorage)

            face_len = (bdry_stor.opposite.native_block
                    .local_discretization.face_node_count())
            gpu_base = bdry_stor.gpu_bdry_index_in_floats
            result[cpu_base:cpu_base+face_len] = \
                    numpy.arange(gpu_base, gpu_base+face_len)
            cpu_base += face_len

        assert (result >= 0).all()
        return result

    @memoize_method
    def _gpu_boundary_embedding_on_gpu(self, tag):
        return gpuarray.to_gpu(
                numpy.asarray(
                    self._gpu_boundary_embedding(tag),
                    dtype=numpy.uint32))

    class _BoundaryToGPUFuture(Future):
        def __init__(self, discr, field, tag, read_map=None):
            self.discr = discr

            from hedge.tools import log_shape

            ls = log_shape(field)
            if ls == ():
                field_list = [field]
                n = 1
            else:
                field_list = field
                n, = ls

            one_field = field_list[0]
            one_field_size = len(one_field)

            if field.dtype == object:
                self.buf = buf = discr.pagelocked_pool.allocate(
                        ls+one_field.shape, dtype=discr.default_scalar_type)
                for i, subf in enumerate(field):
                    buf[i, :] = subf
            else:
                assert field.flags.c_contiguous
                buf = field

            self.stream = discr._get_stream()

            buf.shape = buf.size,
            try:
                self.buf_gpu = buf_gpu = gpuarray.to_gpu_async(
                        buf, discr.pool.allocate, self.stream)
            except cuda.LogicError:
                # buf is not pagelocked
                self.buf_gpu = buf_gpu = gpuarray.to_gpu(buf, discr.pool.allocate)

            from hedge.tools import make_obj_array
            out = make_obj_array([
                discr.boundary_empty(tag) for i in range(n)])

            if one_field_size:
                if read_map is None:
                    gpuarray.multi_put(
                            arrays=[
                                buf_gpu[i*one_field_size:(i+1)*one_field_size]
                                for i in range(n)],
                            dest_indices=discr._gpu_boundary_embedding_on_gpu(tag),
                            out=out, stream=self.stream)
                else:
                    gpuarray.multi_take_put(
                            arrays=[buf_gpu for i in range(n)],
                            dest_indices=discr._gpu_boundary_embedding_on_gpu(tag),
                            src_indices=read_map,
                            src_offsets=[i*one_field_size for i in range(n)],
                            out=out, stream=self.stream)

            if ls == ():
                self.result = out[0]
            else:
                self.result = out

        def is_ready(self):
            return self.stream.is_done()

        def __call__(self):
            self.stream.synchronize()
            self.discr._release_stream(self.stream)
            return self.result

    def _boundary_from_gpu(self, field, tag):
        def f(field):
            return field.get()[self._gpu_boundary_embedding(tag)]

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)

    def convert_volume(self, field, kind):
        orig_kind = self.get_kind(field)

        if kind == "numpy" and orig_kind == "gpu":
            return self._volume_from_gpu(field)
        elif kind == "gpu" and orig_kind == "numpy":
            return self._volume_to_gpu(field)
        else:
            return hedge.discretization.Discretization.convert_volume(
                    self, field, kind)

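    # Usage sketch (illustrative only): volume fields can be moved between
    # the CPU ("numpy") and GPU ("gpu") layouts explicitly, e.g. to inspect
    # a GPU field with numpy:
    #
    #   gpu_field = discr.convert_volume(cpu_field, kind="gpu")
    #   cpu_again = discr.convert_volume(gpu_field, kind="numpy")
    #
    # The embedding used by _volume_to_gpu/_volume_from_gpu makes this a
    # round trip on the meaningful (non-padding) dofs.
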
    def convert_boundary(self, field, tag, kind):
        orig_kind = self.get_kind(field)

        if kind == "numpy" and orig_kind == "gpu":
            return self._boundary_from_gpu(field, tag)
        elif kind == "gpu" and orig_kind == "numpy":
            return self._BoundaryToGPUFuture(self, field, tag)()
        else:
            return hedge.discretization.Discretization.convert_boundary(
                    self, field, tag, kind)

    def convert_boundary_async(self, field, tag, kind, read_map=None):
        orig_kind = self.get_kind(field)

        if kind == "gpu" and orig_kind == "numpy":
            return self._BoundaryToGPUFuture(self, field, tag, read_map)
        else:
            return hedge.discretization.Discretization.convert_boundary_async(
                    self, field, tag, kind, read_map)

    # vector construction tools -----------------------------------------------
    def _empty_gpuarray(self, shape, dtype):
        return gpuarray.empty(shape, dtype=dtype,
                allocator=self.pool.allocate)

    def _zeros_gpuarray(self, shape, dtype):
        result = gpuarray.empty(shape, dtype=dtype,
                allocator=self.pool.allocate)
        result.fill(0)
        return result

    def _new_vec(self, shape, create_func, dtype, base_size):
        if dtype is None:
            dtype = self.default_scalar_type

        if shape == ():
            return create_func((base_size,), dtype=dtype)

        result = numpy.empty(shape, dtype=object)
        from pytools import indices_in_shape
        for i in indices_in_shape(shape):
            result[i] = create_func((base_size,), dtype=dtype)
        return result

    # vector construction -----------------------------------------------------
    compute_kind = "gpu"

    def get_kind(self, field):
        if isinstance(field, gpuarray.GPUArray):
            return "gpu"

        from hedge.tools import log_shape
        from pytools import indices_in_shape

        first_field = field[iter(indices_in_shape(log_shape(field))).next()]
        if isinstance(first_field, numpy.ndarray):
            return "numpy"
        elif isinstance(first_field, gpuarray.GPUArray):
            return "gpu"
        else:
            raise TypeError("invalid field kind")

    def volume_empty(self, shape=(), dtype=None, kind="gpu"):
        if kind != "gpu":
            return hedge.discretization.Discretization.volume_empty(
                    self, shape, dtype, kind)

        return self._new_vec(shape, self._empty_gpuarray, dtype,
                self.gpu_dof_count())

    def volume_zeros(self, shape=(), dtype=None, kind="gpu"):
        if kind != "gpu":
            return hedge.discretization.Discretization.volume_zeros(
                    self, shape, dtype, kind)

        return self._new_vec(shape, self._zeros_gpuarray, dtype,
                self.gpu_dof_count())

    def boundary_empty(self, tag, shape=(), dtype=None, kind="gpu"):
        if kind == "gpu":
            return self._new_vec(shape, self._empty_gpuarray, dtype,
                    self.aligned_boundary_floats)
        elif kind == "numpy-mpi-recv":
            return self.pagelocked_pool.allocate(
                    shape+(len(self.get_boundary(tag).nodes),),
                    dtype)
        else:
            return hedge.discretization.Discretization.boundary_empty(
                    self, tag, shape, dtype, kind)

    def boundary_zeros(self, tag, shape=(), dtype=None, kind="gpu"):
        if kind == "gpu":
            return self._new_vec(shape, self._zeros_gpuarray, dtype,
                    self.aligned_boundary_floats)
        elif kind == "numpy-mpi-recv":
            result = self.pagelocked_pool.allocate(
                    shape+(len(self.get_boundary(tag).nodes),),
                    dtype)
            result.fill(0)
            return result
        else:
            return hedge.discretization.Discretization.boundary_zeros(
                    self, tag, shape, dtype, kind)

    def volumize_boundary_field(self, bfield, tag):
        if self.get_kind(bfield) != "gpu":
            return hedge.discretization.Discretization.volumize_boundary_field(
                    self, bfield, tag)

        raise NotImplementedError

    @memoize_method
    def _boundarize_info(self, tag):
        from_indices = []
        to_indices = []

        for elface in self.mesh.tag_to_boundary.get(tag, []):
            vol_face = self.face_storage_map[elface]
            bdry_face = vol_face.opposite
            assert isinstance(bdry_face, GPUBoundaryFaceStorage)

            vol_el_index = \
                    self.find_el_gpu_index(vol_face.el_face[0])
            native_ilist = self.index_lists[vol_face.native_index_list_id]
            from_indices.extend(vol_el_index+i for i in native_ilist)
            bdry_index = bdry_face.gpu_bdry_index_in_floats
            to_indices.extend(
                    xrange(bdry_index, bdry_index+len(native_ilist)))

        return from_indices, to_indices

    @memoize_method
    def _gpu_boundarize_info(self, tag):
        from_indices, to_indices = self._boundarize_info(tag)
        return (
                gpuarray.to_gpu(
                    numpy.array(from_indices, dtype=numpy.uint32)),
                gpuarray.to_gpu(
                    numpy.array(to_indices, dtype=numpy.uint32)),
                )

    @memoize_method
    def _numpy_boundarize_info(self, tag):
        from_indices, to_indices = self._boundarize_info(tag)

        temp_tgt = numpy.zeros((self.aligned_boundary_floats,), dtype=numpy.int32)
        temp_tgt[to_indices] = from_indices

        result = temp_tgt[self._gpu_boundary_embedding(tag)]

        return gpuarray.to_gpu(result)

    class _BoundarizeGPUToNumpyFuture(Future):
        def __init__(self, discr, field, log_shape, tag):
            self.discr = discr

            base_size = len(discr.get_boundary(tag).nodes)
            self.result = result = discr.pagelocked_pool.allocate(
                    log_shape+(base_size,),
                    dtype=discr.default_scalar_type)
            self.result_gpu = gpuarray.empty((result.size,), result.dtype,
                    allocator=discr.pool.allocate)

            # FIXME Technically, we're missing a sync primitive here.
            # We would need to make sure that 'field' is completely
            # written, but CUDA doesn't quite allow that without a
            # full cuda.Context.synchronize() as of version 2.2.
            # For now we're ok omitting the sync primitive since in a
            # typical DG operator, what we're using here is not
            # computed just before--it's data from the last
            # timestep, and that's guaranteed to be there.

            self.stream = discr._get_stream()
            gpuarray.multi_take(field, discr._numpy_boundarize_info(tag),
                    [self.result_gpu[i*base_size:(i+1)*base_size]
                        for i in range(len(field))],
                    stream=self.stream)
            self.result_gpu.get_async(self.stream, result)

        def is_ready(self):
            return self.stream.is_done()

        def __call__(self):
            self.stream.synchronize()
            self.discr._release_stream(self.stream)
            return self.result

    def boundarize_volume_field(self, field, tag, kind=None):
        if self.get_kind(field) == self.compute_kind:
            from hedge.tools import log_shape
            ls = log_shape(field)

            if kind is None or kind == self.compute_kind:
                # GPU -> GPU boundarize

                from_indices, to_indices = \
                        self._gpu_boundarize_info(tag)
                kind = None

                from hedge.mesh import TAG_ALL
                if tag != TAG_ALL:
                    make_new = self.boundary_zeros
                else:
                    make_new = self.boundary_empty

                if ls != ():
                    from pytools import single_valued
                    out = result = make_new(tag, shape=ls,
                            dtype=single_valued(f.dtype for f in field))
                    src = field
                else:
                    result = make_new(tag, dtype=field.dtype)
                    out = [result]
                    src = [field]

                gpuarray.multi_take_put(src,
                        to_indices, from_indices, out=out)
                if kind is None:
                    return result
                else:
                    return self.convert_boundary(result, tag, kind)

            elif kind == "numpy":
                # GPU -> CPU boundarize, for MPI

                if ls == ():
                    field = [field]

                return self._BoundarizeGPUToNumpyFuture(self, field, ls, tag)()
            else:
                raise ValueError("invalid target boundary kind: %s" % kind)
        else:
            return hedge.discretization.Discretization.boundarize_volume_field(
                    self, field, tag, kind)

    def boundarize_volume_field_async(self, field, tag, kind=None):
        if self.get_kind(field) == self.compute_kind and kind == "numpy":
            from hedge.tools import log_shape
            ls = log_shape(field)
            if ls == ():
                field = [field]

            return self._BoundarizeGPUToNumpyFuture(self, field, ls, tag)
        else:
            return hedge.discretization.Discretization\
                    .boundarize_volume_field_async(self, field, tag, kind)

    def prepare_from_neighbor_map(self, indices):
        return gpuarray.to_gpu(numpy.asarray(indices, dtype=numpy.uint32))

    # ancillary kernel planning/construction ----------------------------------
    @memoize_method
    def element_local_kernel(self):
        from hedge.backends.cuda.plan import make_element_local_plan
        el_local_plan, _ = make_element_local_plan(
                self, self.given,
                op_name="el_local",
                aligned_preimage_dofs_per_microblock=
                    self.given.microblock.aligned_floats,
                preimage_dofs_per_el=self.given.dofs_per_el(),
                with_index_check=True)
        return el_local_plan.make_kernel(self, with_index_check=True)

    # scalar reduction --------------------------------------------------------
    def nodewise_dot_product(self, a, b):
        return gpuarray.subset_dot(
                self._meaningful_volume_indices(),
                a, b, dtype=numpy.float64).get()

    # numbering tools ----------------------------------------------------------
    @memoize_method
    def elgroup_microblock_indices(self, elgroup):
        """For a given L{hedge.discretization._ElementGroup} instance
        C{elgroup}, return an index array (of dtype C{numpy.intp}) that,
        indexed by the block-microblock element number, gives the element
        number within C{elgroup}.
        """

        def get_el_index_in_el_group(el):
            mygroup, idx = self.group_map[el.id]
            assert mygroup is elgroup
            return idx

        given = self.given

        el_count = len(self.blocks) * given.elements_per_block()
        elgroup_indices = numpy.zeros((el_count,), dtype=numpy.intp)

        for block in self.blocks:
            block_elgroup_indices = [get_el_index_in_el_group(el)
                    for mb in block.microblocks
                    for el in mb]
            offset = block.number * given.elements_per_block()
            elgroup_indices[offset:offset+len(block_elgroup_indices)] = \
                    block_elgroup_indices

        return elgroup_indices




def make_block_visualization(discr):
    result = discr.volume_zeros(kind="numpy")
    for block in discr.blocks:
        for cpu_slice in block.cpu_slices:
            result[cpu_slice] = block.number

    return result
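
# Usage sketch (the visualizer calls below follow hedge.visualization's usual
# pattern but are illustrative rather than authoritative):
#
#   from hedge.visualization import VtkVisualizer
#   vis = VtkVisualizer(discr, rcon, "blocks")
#   visf = vis.make_file("blocks")
#   vis.add_data(visf, [("gpu_block", make_block_visualization(discr))])
#   visf.close()
#
# This writes a scalar field whose value on each element is the number of the
# GPU block to which that element was assigned.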