Package hedge :: Package cuda :: Module discretization
[frames] | no frames]

Source Code for Module hedge.cuda.discretization

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  import numpy.linalg as la 
 26  import hedge.discretization 
 27  import pycuda.driver as cuda 
 28  import pycuda.gpuarray as gpuarray 
 29  from pytools import memoize_method, memoize 
 30   
 31   
 32   
 33   
 34  # gpu data organization ------------------------------------------------------- 
35 -class GPUBlock(object):
36 """Describes what data is local to each thread block on the GPU. 37 38 @ivar number: The global number of this block. 39 @ivar local_discretization: The L{hedge.element.Element} instance used 40 for elements in this block. 41 @ivar cpu_slices: A list of slices describing the CPU-side 42 storage locations for the block's elements. 43 @ivar microblocks: A list of lists of L{hedge.mesh.Element} instances, 44 each representing the elements in one block, and together representing 45 one block. 46 @ivar el_offsets_list: A lsit containing the offsets of the elements in 47 this block, in order. 48 @ivar el_number_map: A dictionary mapping L{hedge.mesh.Element} instances 49 to their number within this block. 50 """ 51 __slots__ = ["number", "local_discretization", "cpu_slices", "microblocks", 52 "el_offsets_list", "el_number_map", "el_number_map" 53 ] 54
55 - def __init__(self, number, local_discretization, cpu_slices, microblocks, 56 el_offsets_list, el_number_map):
57 self.number = number 58 self.local_discretization = local_discretization 59 self.cpu_slices = cpu_slices 60 self.microblocks = microblocks 61 self.el_offsets_list = el_offsets_list 62 self.el_number_map = el_number_map
63 64 65 66 67
68 -class GPUFaceStorage(object):
69 """Describes where the dofs of an element face are stored. 70 71 @ivar opposite: the L{GPUFacestorage} instance for the face 72 oposite to this one. 73 """ 74 __slots__ = ["opposite"] 75
76 - def __init__(self):
77 self.opposite = None
78
79 - def set_opposite(self, opp):
80 if self.opposite is None: 81 self.opposite = opp 82 else: 83 assert self.opposite is opp
84
85 -class GPUInteriorFaceStorage(GPUFaceStorage):
86 """Describes storage locations for a face local to an element in a block. 87 88 @ivar el_face: a tuple C{(element, face_number)}. 89 @ivar cpu_slice: the base index of the element in CPU numbering. 90 @ivar native_index_list_id: 91 @ivar opp_write_index_list_id: 92 @ivar native_block: block in which element is to be found. 93 @ivar face_pair_side: 94 """ 95 __slots__ = [ 96 "el_face", "cpu_slice", 97 "native_index_list_id", "opp_write_index_list_id", 98 "global_int_flux_index_list_id", "global_ext_flux_index_list_id", 99 "native_block", 100 "face_pair_side"] 101
102 - def __init__(self, el_face, cpu_slice, native_index_list_id, 103 native_block, face_pair_side):
104 GPUFaceStorage.__init__(self) 105 self.el_face = el_face 106 self.cpu_slice = cpu_slice 107 self.native_index_list_id = native_index_list_id 108 self.native_block = native_block 109 self.face_pair_side = face_pair_side
110
111 -class GPUBoundaryFaceStorage(GPUFaceStorage):
112 """Describes storage locations for a boundary face. 113 114 @ivar cpu_bdry_index_in_floats: this face's starting index 115 in the CPU-based TAG_ALL boundary array [floats]. 116 @ivar gpu_bdry_index_in_floats: this face's starting index 117 in the GPU-based TAG_ALL boundary array [floats]. 118 @ivar face_pair_side: 119 """ 120 __slots__ = [ 121 "cpu_bdry_index_in_floats", 122 "gpu_bdry_index_in_floats", 123 "face_pair_side", 124 ] 125
126 - def __init__(self, 127 cpu_bdry_index_in_floats, 128 gpu_bdry_index_in_floats, 129 face_pair_side 130 ):
131 GPUFaceStorage.__init__(self) 132 self.cpu_bdry_index_in_floats = cpu_bdry_index_in_floats 133 self.gpu_bdry_index_in_floats = gpu_bdry_index_in_floats 134 self.face_pair_side = face_pair_side
135 136 137 138 139 @memoize
140 -def _boundarize_kernel():
141 mod = cuda.SourceModule(""" 142 texture<float, 1, cudaReadModeElementType> field_tex; 143 __global__ void boundarize(float *bfield, 144 unsigned int *to_indices, 145 unsigned int *from_indices, 146 unsigned int n) 147 { 148 int tid = threadIdx.x; 149 int total_threads = gridDim.x*blockDim.x; 150 int cta_start = blockDim.x*blockIdx.x; 151 int i; 152 153 for (i = cta_start + tid; i < n; i += total_threads) 154 { 155 bfield[to_indices[i]] = 156 tex1Dfetch(field_tex, from_indices[i]); 157 } 158 } 159 """) 160 161 return (mod.get_function("boundarize"), 162 mod.get_texref("field_tex"))
163 164 165 166 167 # GPU discretization ----------------------------------------------------------
168 -class Discretization(hedge.discretization.Discretization):
169 - def _make_plan(self, ldis, mesh, float_type):
170 from hedge.cuda.plan import \ 171 FluxExecutionPlan, \ 172 Parallelism, \ 173 optimize_plan 174 175 def generate_valid_plans(): 176 for parallel_faces in range(1,32): 177 for mbs_per_block in range(1,256): 178 flux_plan = FluxExecutionPlan( 179 self.devdata, ldis, parallel_faces, 180 mbs_per_block, float_type=float_type) 181 if flux_plan.invalid_reason() is None: 182 yield flux_plan
183 184 return optimize_plan( 185 generate_valid_plans, 186 lambda plan: plan.elements_per_block() 187 )
188 189 190 191
192 - def _partition_mesh(self, mesh, flux_plan):
193 # search for mesh partition that matches plan 194 from pymetis import part_graph 195 orig_part_count = part_count = ( 196 len(mesh.elements)//flux_plan.elements_per_block()+1) 197 while True: 198 cuts, partition = part_graph(part_count, 199 mesh.element_adjacency_graph(), 200 vweights=[1000]*len(mesh.elements)) 201 202 # prepare a mapping: block# -> # of external interfaces 203 block2extifaces = dict((i, 0) for i in range(part_count)) 204 205 for (e1, f1), (e2, f2) in mesh.both_interfaces(): 206 b1 = partition[e1.id] 207 b2 = partition[e2.id] 208 209 if b1 != b2: 210 block2extifaces[b1] += 1 211 212 for el, face_nbr in mesh.tag_to_boundary[hedge.mesh.TAG_ALL]: 213 b1 = partition[el.id] 214 block2extifaces[b1] += 1 215 216 blocks = dict((i, []) for i in range(part_count)) 217 for el_id, block in enumerate(partition): 218 blocks[block].append(el_id) 219 block_elements = max(len(block_els) for block_els in blocks.itervalues()) 220 221 from hedge.cuda.plan import Parallelism 222 actual_plan = flux_plan.copy( 223 max_ext_faces=max(block2extifaces.itervalues()), 224 max_faces=max( 225 len(blocks[b])*flux_plan.faces_per_el() 226 + block2extifaces[b] 227 for b in range(len(blocks))), 228 ) 229 assert actual_plan.max_faces % 2 == 0 230 231 if (block_elements <= actual_plan.elements_per_block() 232 and (flux_plan.occupancy_record().occupancy - 233 actual_plan.occupancy_record().occupancy) < 1e-10): 234 break 235 236 part_count += min(5, int(part_count*0.01)) 237 238 print "blocks: theoretical:%d practical:%d" % (orig_part_count, part_count) 239 240 if False: 241 from matplotlib.pylab import hist, show 242 print plan.get_extface_count() 243 hist(block2extifaces.values()) 244 show() 245 hist([len(block_els) for block_els in blocks.itervalues()]) 246 show() 247 248 return actual_plan, partition
249 250 251 252
253 - def __init__(self, mesh, local_discretization=None, 254 order=None, flux_plan=None, init_cuda=True, debug=False, 255 dev=None, default_scalar_type=numpy.float32):
256 ldis = self.get_local_discretization(mesh, local_discretization, order) 257 258 if init_cuda: 259 cuda.init() 260 261 if dev is None: 262 assert cuda.Device.count() >= 1 263 dev = cuda.Device(0) 264 if isinstance(dev, int): 265 dev = cuda.Device(dev) 266 if init_cuda: 267 self.cuda_context = dev.make_context() 268 269 self.device = dev 270 from hedge.cuda.tools import DeviceData 271 self.devdata = DeviceData(dev) 272 273 # make preliminary plan 274 if flux_plan is None: 275 flux_plan = self._make_plan(ldis, mesh, default_scalar_type) 276 print "projected flux exec plan:", flux_plan 277 278 # partition mesh, obtain updated plan 279 self.flux_plan, self.partition = self._partition_mesh(mesh, flux_plan) 280 del flux_plan 281 print "actual flux exec plan:", self.flux_plan 282 print "actual local op exec plan:", self.flux_plan.diff_plan() 283 print "actual flux local exec plan:", self.flux_plan.flux_lifting_plan() 284 285 # initialize superclass 286 hedge.discretization.Discretization.__init__(self, mesh, ldis, debug=debug, 287 default_scalar_type=default_scalar_type) 288 289 # build our own data structures 290 self.blocks = self._build_blocks() 291 self.face_storage_map = self._build_face_storage_map() 292 293 # make a reference discretization 294 from hedge.discr_precompiled import Discretization 295 self.test_discr = Discretization(mesh, ldis)
296
297 - def _build_blocks(self):
298 block_el_numbers = {} 299 for el_id, block in enumerate(self.partition): 300 block_el_numbers.setdefault(block, []).append(el_id) 301 302 block_count = len(block_el_numbers) 303 304 def make_block(block_num): 305 fplan = self.flux_plan 306 307 microblocks = [] 308 current_microblock = [] 309 el_offsets_list = [] 310 el_number_map = {} 311 elements = [self.mesh.elements[ben] for ben in block_el_numbers[block_num]] 312 for block_el_nr, el in enumerate(elements): 313 el_offset = ( 314 len(microblocks)*fplan.microblock.aligned_floats 315 + len(current_microblock)*fplan.dofs_per_el()) 316 el_number_map[el] = block_el_nr 317 el_offsets_list.append(el_offset) 318 319 current_microblock.append(el) 320 if len(current_microblock) == fplan.microblock.elements: 321 microblocks.append(current_microblock) 322 current_microblock = [] 323 324 if current_microblock: 325 microblocks.append(current_microblock) 326 327 assert len(microblocks) <= fplan.microblocks_per_block() 328 329 eg, = self.element_groups 330 return GPUBlock(block_num, 331 local_discretization=eg.local_discretization, 332 cpu_slices=[self.find_el_range(el.id) for el in elements], 333 microblocks=microblocks, 334 el_offsets_list=el_offsets_list, 335 el_number_map=el_number_map)
336 337 return [make_block(block_num) for block_num in range(block_count)] 338 339 340 341
342 - def _build_face_storage_map(self):
343 # Side effects: 344 # - fill in GPUBlock.extfaces 345 # - set self.aligned_boundary_floats 346 fsm = {} 347 348 from hedge.tools import IndexListRegistry 349 fil_registry = IndexListRegistry(self.debug) 350 351 def make_int_face(face_pair_side): 352 el = self.mesh.elements[face_pair_side.element_id] 353 elface = (el, face_pair_side.face_id) 354 355 block = self.blocks[self.partition[el.id]] 356 iln = fil_registry.register( 357 (ldis, face_pair_side.face_id), 358 lambda: ldis.face_indices()[face_pair_side.face_id] 359 ) 360 result = GPUInteriorFaceStorage( 361 elface, 362 cpu_slice=self.find_el_range(el.id), 363 native_index_list_id=iln, 364 native_block=block, 365 face_pair_side=face_pair_side 366 ) 367 368 assert elface not in fsm 369 fsm[elface] = result 370 return result
371 372 373 def narrow_ilist(in_el_ilist, native_el_ilist): 374 return get_read_from_map_from_permutation 375 376 el_dof_to_face_dof = dict( 377 (el_dof, i) 378 for i, el_dof in enumerate(native_el_ilist)) 379 return tuple(el_dof_to_face_dof[el_dof] 380 for el_dof in in_el_ilist) 381 382 int_fg, = self.face_groups 383 ldis = int_fg.ldis_loc 384 assert ldis == int_fg.ldis_opp 385 386 id_face_index_list_number = fil_registry.register( 387 None, 388 lambda: tuple(xrange(ldis.face_node_count())) 389 ) 390 assert id_face_index_list_number == 0 391 392 from pytools import single_valued 393 for fp in int_fg.face_pairs: 394 face1 = make_int_face(fp.loc) 395 face2 = make_int_face(fp.opp) 396 face1.opposite = face2 397 face2.opposite = face1 398 399 def apply_write_map(wmap, sequence): 400 result = [None] * len(sequence) 401 for wm_i, seq_i in zip(wmap, sequence): 402 result[wm_i] = seq_i 403 assert None not in result 404 return tuple(result) 405 406 f_ind = ldis.face_indices() 407 408 def face1_in_el_ilist(): 409 return tuple(int_fg.index_lists[ 410 fp.loc.face_index_list_number]) 411 412 def face2_in_el_ilist(): 413 return tuple(int_fg.index_lists[ 414 fp.opp.face_index_list_number]) 415 416 def opp_write_map(): 417 return tuple( 418 int_fg.index_lists[fp.opp_native_write_map]) 419 420 face1.global_int_flux_index_list_id = fil_registry.register( 421 (int_fg, fp.loc.face_index_list_number), 422 face1_in_el_ilist) 423 face1.global_ext_flux_index_list_id = fil_registry.register( 424 (int_fg, fp.opp.face_index_list_number), 425 face2_in_el_ilist) 426 427 face2.global_int_flux_index_list_id = fil_registry.register( 428 (int_fg, fp.opp_native_write_map, 429 fp.opp.face_index_list_number), 430 lambda: apply_write_map( 431 opp_write_map(), face2_in_el_ilist()) 432 ) 433 face2.global_ext_flux_index_list_id = fil_registry.register( 434 (int_fg, fp.opp_native_write_map, 435 fp.loc.face_index_list_number), 436 lambda: apply_write_map( 437 opp_write_map(), face1_in_el_ilist()) 438 ) 439 440 from pytools import get_write_to_map_from_permutation as gwtm 441 #assert gwtm(face2_in_el_ilist, f_ind[fp.opp.face_id]) == opp_write_map 442 face1.opp_write_index_list_id = fil_registry.register( 443 (int_fg, "wtm", fp.opp.face_index_list_number, 444 fp.opp.face_id), 445 lambda: gwtm(face2_in_el_ilist(), f_ind[fp.opp.face_id]) 446 ) 447 face2.opp_write_index_list_id = fil_registry.register( 448 (int_fg, "wtm", 449 fp.opp_native_write_map, 450 fp.loc.face_index_list_number, 451 fp.loc.face_id), 452 lambda: gwtm( 453 apply_write_map(opp_write_map(), face1_in_el_ilist()), 454 f_ind[fp.loc.face_id]) 455 ) 456 457 self.aligned_boundary_floats = 0 458 from hedge.mesh import TAG_ALL 459 for bdry_fg in self.get_boundary(TAG_ALL).face_groups: 460 assert ldis == bdry_fg.ldis_loc 461 aligned_fnc = self.devdata.align_dtype(ldis.face_node_count(), 462 self.flux_plan.float_size) 463 for fp in bdry_fg.face_pairs: 464 assert fp.opp.element_id == hedge._internal.INVALID_ELEMENT 465 #assert (tuple(bdry_fg.index_lists[fp.opp.face_index_list_number]) 466 #== id_face_index_list) 467 468 face1 = make_int_face(fp.loc) 469 face2 = GPUBoundaryFaceStorage( 470 fp.opp.el_base_index, 471 self.aligned_boundary_floats, 472 fp.opp 473 ) 474 self.aligned_boundary_floats += aligned_fnc 475 face1.opposite = face2 476 face2.opposite = face1 477 478 face1.global_int_flux_index_list_id = fil_registry.register( 479 (bdry_fg,fp.loc.face_index_list_number), 480 lambda: tuple(bdry_fg.index_lists[ 481 fp.loc.face_index_list_number]) 482 ) 483 face1.global_ext_flux_index_list_id = fil_registry.register( 484 (bdry_fg, fp.opp.face_index_list_number), 485 lambda: tuple(bdry_fg.index_lists[ 486 fp.opp.face_index_list_number]) 487 ) 488 489 self.index_lists = fil_registry.index_lists 490 return fsm 491 492 493 494
495 - def find_el_gpu_index(self, el):
496 fplan = self.flux_plan 497 block = self.blocks[self.partition[el.id]] 498 499 mb_nr, in_mb_nr = divmod(block.el_number_map[el], fplan.microblock.elements) 500 501 return (block.number * self.flux_plan.dofs_per_block() 502 + mb_nr*fplan.microblock.aligned_floats 503 + in_mb_nr*fplan.dofs_per_el())
504
505 - def find_number_in_block(self, el):
506 block = self.blocks[self.partition[el.id]] 507 return block.el_number_map[el]
508
509 - def gpu_dof_count(self):
510 from hedge.cuda.tools import int_ceiling 511 512 fplan = self.flux_plan 513 return int_ceiling( 514 int_ceiling( 515 fplan.dofs_per_block() * len(self.blocks), 516 fplan.diff_plan().dofs_per_macroblock()), 517 fplan.flux_lifting_plan().dofs_per_macroblock())
518
519 - def volume_to_gpu(self, field):
520 from hedge.tools import log_shape 521 ls = log_shape(field) 522 if ls != (): 523 result = numpy.array(ls, dtype=object) 524 525 from pytools import indices_in_shape 526 527 for i in indices_in_shape(ls): 528 result[i] = self.volume_to_gpu(field[i]) 529 return result 530 else: 531 copy_vec = numpy.empty((self.gpu_dof_count(),), dtype=numpy.float32) 532 533 block_offset = 0 534 block_size = self.flux_plan.dofs_per_block() 535 for block in self.blocks: 536 face_length = block.local_discretization.face_node_count() 537 el_length = block.local_discretization.node_count() 538 539 for el_offset, cpu_slice in zip( 540 block.el_offsets_list, block.cpu_slices): 541 copy_vec[block_offset+el_offset:block_offset+el_offset+el_length] = \ 542 field[cpu_slice] 543 544 block_offset += block_size 545 546 return gpuarray.to_gpu(copy_vec)
547
548 - def volume_from_gpu(self, field):
549 from hedge.tools import log_shape 550 ls = log_shape(field) 551 if ls != (): 552 result = numpy.zeros(ls, dtype=object) 553 554 from pytools import indices_in_shape 555 556 for i in indices_in_shape(ls): 557 result[i] = self.volume_from_gpu(field[i]) 558 return result 559 else: 560 copied_vec = field.get(pagelocked=True) 561 result = numpy.empty(shape=(len(self),), dtype=copied_vec.dtype) 562 563 block_offset = 0 564 block_size = self.flux_plan.dofs_per_block() 565 for block in self.blocks: 566 el_length = block.local_discretization.node_count() 567 568 for el_offset, cpu_slice in zip( 569 block.el_offsets_list, block.cpu_slices): 570 result[cpu_slice] = \ 571 copied_vec[block_offset+el_offset 572 :block_offset+el_offset+el_length] 573 574 block_offset += block_size 575 576 return result
577 578 @memoize_method
579 - def gpu_boundary_embedding(self, tag):
580 """Return an array of indices embedding a CPU boundary 581 field for C{tag} into the GPU boundary field.""" 582 583 bdry = self.get_boundary(tag) 584 result = numpy.empty( 585 (len(bdry.nodes),), 586 dtype=numpy.intp) 587 result.fill(-1) 588 589 cpu_base = 0 590 for elface in self.mesh.tag_to_boundary.get(tag, []): 591 face_stor = self.face_storage_map[elface] 592 bdry_stor = face_stor.opposite 593 assert isinstance(bdry_stor, GPUBoundaryFaceStorage) 594 595 face_len = (bdry_stor.opposite.native_block 596 .local_discretization.face_node_count()) 597 gpu_base = bdry_stor.gpu_bdry_index_in_floats 598 result[cpu_base:cpu_base+face_len] = \ 599 numpy.arange(gpu_base, gpu_base+face_len) 600 cpu_base += face_len 601 602 assert (result>=0).all() 603 return result
604
605 - def boundary_to_gpu(self, tag, field):
606 from hedge.tools import log_shape 607 ls = log_shape(field) 608 if ls != (): 609 from pytools import indices_in_shape 610 result = numpy.zeros(ls, dtype=object) 611 for i in indices_in_shape(ls): 612 result[i] = self.boundary_to_gpu(tag, field[i]) 613 return result 614 else: 615 result = cuda.pagelocked_empty( 616 (self.aligned_boundary_floats,), 617 dtype=field.dtype) 618 619 # The boundary cannot be completely uninitialized, 620 # because it might contain NaNs. If a certain part of the 621 # boundary is to be ignored, it is simply multiplied by 622 # zero in the kernel, which won't make the NaNs disappear. 623 624 # Therefore, as a second best solution, fill the boundary 625 # with a bogus value so that we can tell if it actually 626 # enters the computation. 627 628 result.fill(17) 629 result[self.gpu_boundary_embedding(tag)] = field 630 return gpuarray.to_gpu(result)
631 632 # vector construction -----------------------------------------------------
633 - def volume_empty(self, shape=(), dtype=None):
634 if dtype is None: 635 dtype = self.flux_plan.float_type 636 637 return gpuarray.empty(shape+(self.gpu_dof_count(),), dtype=dtype)
638
639 - def volume_zeros(self, shape=()):
640 result = self.volume_empty(shape) 641 result.fill(0) 642 return result
643
644 - def interpolate_volume_function(self, f, dtype=None):
645 s = hedge.discretization.Discretization 646 647 def tgt_factory(shape, dtype): 648 return s.volume_empty(self, shape, dtype)
649 650 return self.volume_to_gpu( 651 s.interpolate_volume_function(self, f, tgt_factory)) 652
653 - def _new_bdry(self, tag, shape, create_func, dtype):
654 if dtype is None: 655 dtype = self.default_scalar_type 656 657 if shape == (): 658 return create_func((self.aligned_boundary_floats,), dtype=dtype) 659 660 result = numpy.empty(shape, dtype=object) 661 from pytools import indices_in_shape 662 bdry = self.get_boundary(TAG_ALL) 663 for i in indices_in_shape(shape): 664 result[i] = create_func((self.aligned_boundary_floats,), dtype=dtype) 665 return result
666
667 - def boundary_empty(self, tag=hedge.mesh.TAG_ALL, shape=(), dtype=None):
668 return self._new_bdry(tag, shape, gpuarray.empty, dtype)
669
670 - def boundary_zeros(self, tag=hedge.mesh.TAG_ALL, shape=(), dtype=None):
671 return self._new_bdry(tag, shape, gpuarray.zeros, dtype)
672
673 - def interpolate_boundary_function(self, f, tag=hedge.mesh.TAG_ALL):
674 s = hedge.discretization.Discretization 675 def tgt_factory(tag, shape, dtype): 676 return s.boundary_zeros(self, tag, shape, dtype)
677 678 return self.boundary_to_gpu(tag, 679 s.interpolate_boundary_function(self, f, tag, tgt_factory)) 680
681 - def boundary_normals(self, tag=hedge.mesh.TAG_ALL):
682 s = hedge.discretization.Discretization 683 def tgt_factory(tag, shape, dtype): 684 return s.boundary_zeros(self, tag, shape, dtype)
685 686 return self.boundary_to_gpu(tag, 687 s.boundary_normals(self, tag, tgt_factory)) 688
689 - def volumize_boundary_field(self, bfield, tag=hedge.mesh.TAG_ALL):
690 raise NotImplementedError
691 692 @memoize_method
693 - def _boundarize_info(self, tag):
694 from_indices = [] 695 to_indices = [] 696 697 for elface in self.mesh.tag_to_boundary.get(tag, []): 698 vol_face = self.face_storage_map[elface] 699 bdry_face = vol_face.opposite 700 assert isinstance(bdry_face, GPUBoundaryFaceStorage) 701 702 vol_el_index = \ 703 self.find_el_gpu_index(vol_face.el_face[0]) 704 native_ilist = self.index_lists[vol_face.native_index_list_id] 705 from_indices.extend(vol_el_index+i for i in native_ilist) 706 bdry_index = bdry_face.gpu_bdry_index_in_floats 707 to_indices.extend( 708 xrange(bdry_index, bdry_index+len(native_ilist))) 709 710 return ( 711 gpuarray.to_gpu( 712 numpy.array(from_indices, dtype=numpy.uint32)), 713 gpuarray.to_gpu( 714 numpy.array(to_indices, dtype=numpy.uint32)), 715 len(from_indices) 716 )
717 718
719 - def boundarize_volume_field(self, field, tag=hedge.mesh.TAG_ALL):
720 kernel, field_texref = _boundarize_kernel() 721 722 from_indices, to_indices, idx_count = self._boundarize_info(tag) 723 block_count, threads_per_block, elems_per_block = \ 724 gpuarray.splay(idx_count) 725 726 def do_scalar(subfield): 727 from hedge.mesh import TAG_ALL 728 if tag != TAG_ALL: 729 result = self.boundary_zeros(tag) 730 else: 731 result = self.boundary_empty(tag) 732 733 if idx_count: 734 subfield.bind_to_texref(field_texref) 735 kernel(result, to_indices, from_indices, 736 numpy.uint32(idx_count), 737 block=(threads_per_block,1,1), grid=(block_count,1), 738 texrefs=[field_texref]) 739 return result
740 741 from hedge.tools import log_shape 742 ls = log_shape(field) 743 744 if ls == (): 745 return do_scalar(field) 746 else: 747 result = numpy.empty(ls, dtype=object) 748 from pytools import indices_in_shape 749 for i in indices_in_shape(ls): 750 result[i] = do_scalar(field[i]) 751 752 return result 753 754 # host vector construction ------------------------------------------------ 755 s = hedge.discretization.Discretization 756 host_volume_empty = s.volume_empty 757 host_volume_zeros = s.volume_zeros 758 del s 759 760 # optemplate processing ---------------------------------------------------
761 - def compile(self, optemplate):
762 from hedge.cuda.execute import OpTemplateWithEnvironment 763 return OpTemplateWithEnvironment(self, optemplate)
764