Package hedge :: Package cuda :: Module discretization
Source Code for Module hedge.cuda.discretization

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  import numpy.linalg as la 
 26  import hedge.discretization 
 27  import pycuda.driver as cuda 
 28  import pycuda.gpuarray as gpuarray 
 29  from pytools import memoize_method, memoize 
 30   
 31   
 32   
 33   
 34  # gpu data organization ------------------------------------------------------- 
 35 -class GPUBlock(object): 
 36      """Describes what data is local to each thread block on the GPU. 
 37   
 38      @ivar number: The global number of this block. 
 39      @ivar local_discretization: The L{hedge.element.Element} instance used 
 40        for elements in this block. 
 41      @ivar cpu_slices: A list of slices describing the CPU-side 
 42        storage locations for the block's elements. 
 43      @ivar microblocks: A list of lists of L{hedge.mesh.Element} instances, 
 44        each representing the elements in one block, and together representing 
 45        one block. 
 46      @ivar el_offsets_list: A lsit containing the offsets of the elements in 
 47        this block, in order. 
 48      @ivar el_number_map: A dictionary mapping L{hedge.mesh.Element} instances 
 49        to their number within this block. 
 50      """ 
 51      __slots__ = ["number", "local_discretization", "cpu_slices", "microblocks",  
 52              "el_offsets_list", "el_number_map", "el_number_map" 
 53              ] 
 54   
 55 -    def __init__(self, number, local_discretization, cpu_slices, microblocks,  
 56              el_offsets_list, el_number_map): 
 57          self.number = number 
 58          self.local_discretization = local_discretization 
 59          self.cpu_slices = cpu_slices 
 60          self.microblocks = microblocks 
 61          self.el_offsets_list = el_offsets_list 
 62          self.el_number_map = el_number_map 
 63   
 64   
 65   
 66   
 67   
 68 -class GPUFaceStorage(object): 
 69      """Describes where the dofs of an element face are stored. 
 70   
 71      @ivar opposite: the L{GPUFacestorage} instance for the face 
 72        oposite to this one. 
 73      """ 
 74      __slots__ = ["opposite"] 
 75   
 76 -    def __init__(self): 
 77          self.opposite = None 
 78   
 79 -    def set_opposite(self, opp): 
 80          if self.opposite is None: 
 81              self.opposite = opp 
 82          else: 
 83              assert self.opposite is opp 
 84   
 85 -class GPUInteriorFaceStorage(GPUFaceStorage): 
 86      """Describes storage locations for a face local to an element in a block. 
 87   
 88      @ivar el_face: a tuple C{(element, face_number)}. 
 89      @ivar cpu_slice: the base index of the element in CPU numbering. 
 90      @ivar native_index_list_id:  
 91      @ivar opp_write_index_list_id: 
 92      @ivar native_block: block in which element is to be found. 
 93      @ivar face_pair_side: 
 94      """ 
 95      __slots__ = [ 
 96              "el_face", "cpu_slice",  
 97              "native_index_list_id", "opp_write_index_list_id", 
 98              "global_int_flux_index_list_id", "global_ext_flux_index_list_id", 
 99              "native_block",  
100              "face_pair_side"] 
101   
102 -    def __init__(self, el_face, cpu_slice, native_index_list_id, 
103              native_block, face_pair_side): 
104          GPUFaceStorage.__init__(self) 
105          self.el_face = el_face 
106          self.cpu_slice = cpu_slice 
107          self.native_index_list_id = native_index_list_id 
108          self.native_block = native_block 
109          self.face_pair_side = face_pair_side 
110   
111 -class GPUBoundaryFaceStorage(GPUFaceStorage): 
112      """Describes storage locations for a boundary face. 
113   
114      @ivar cpu_bdry_index_in_floats: this face's starting index  
115        in the CPU-based TAG_ALL boundary array [floats]. 
116      @ivar gpu_bdry_index_in_floats: this face's starting index  
117        in the GPU-based TAG_ALL boundary array [floats]. 
118      @ivar face_pair_side: 
119      """ 
120      __slots__ = [ 
121              "cpu_bdry_index_in_floats",  
122              "gpu_bdry_index_in_floats",  
123              "face_pair_side", 
124              ] 
125   
126 -    def __init__(self,  
127              cpu_bdry_index_in_floats, 
128              gpu_bdry_index_in_floats, 
129              face_pair_side 
130              ): 
131          GPUFaceStorage.__init__(self) 
132          self.cpu_bdry_index_in_floats = cpu_bdry_index_in_floats 
133          self.gpu_bdry_index_in_floats = gpu_bdry_index_in_floats 
134          self.face_pair_side = face_pair_side 
135   
136   
137   
138   
139  @memoize 
140 -def _boundarize_kernel(): 
141      mod = cuda.SourceModule(""" 
142      texture<float, 1, cudaReadModeElementType> field_tex; 
143      __global__ void boundarize(float *bfield,  
144        unsigned int *to_indices, 
145        unsigned int *from_indices, 
146        unsigned int n) 
147      { 
148        int tid = threadIdx.x; 
149        int total_threads = gridDim.x*blockDim.x; 
150        int cta_start = blockDim.x*blockIdx.x; 
151        int i; 
152               
153        for (i = cta_start + tid; i < n; i += total_threads)  
154        { 
155          bfield[to_indices[i]] =  
156            tex1Dfetch(field_tex, from_indices[i]); 
157        } 
158      } 
159      """) 
160   
161      return (mod.get_function("boundarize"),  
162              mod.get_texref("field_tex")) 
163   
164   
165   
166   
167  # GPU discretization ---------------------------------------------------------- 
168 -class Discretization(hedge.discretization.Discretization): 
169 -    def _make_plan(self, ldis, mesh, float_type): 
170          from hedge.cuda.plan import \ 
171                  FluxExecutionPlan, \ 
172                  Parallelism, \ 
173                  optimize_plan 
174   
175          def generate_valid_plans(): 
176              for parallel_faces in range(1,32): 
177                  for mbs_per_block in range(1,256): 
178                      flux_plan = FluxExecutionPlan( 
179                              self.devdata, ldis, parallel_faces, 
180                              mbs_per_block, float_type=float_type) 
181                      if flux_plan.invalid_reason() is None: 
182                          yield flux_plan 
183   
184          return optimize_plan( 
185                  generate_valid_plans, 
186                  lambda plan: plan.elements_per_block() 
187                  ) 
188   
189   
190   
191   
192 -    def _partition_mesh(self, mesh, flux_plan): 
193          # search for mesh partition that matches plan 
194          from pymetis import part_graph 
195          orig_part_count = part_count = ( 
196                  len(mesh.elements)//flux_plan.elements_per_block()+1) 
197          while True: 
198              cuts, partition = part_graph(part_count, 
199                      mesh.element_adjacency_graph(), 
200                      vweights=[1000]*len(mesh.elements)) 
201   
202              # prepare a mapping:  block# -> # of external interfaces 
203              block2extifaces = dict((i, 0) for i in range(part_count)) 
204   
205              for (e1, f1), (e2, f2) in mesh.both_interfaces(): 
206                  b1 = partition[e1.id] 
207                  b2 = partition[e2.id] 
208   
209                  if b1 != b2: 
210                      block2extifaces[b1] += 1 
211   
212              for el, face_nbr in mesh.tag_to_boundary[hedge.mesh.TAG_ALL]: 
213                  b1 = partition[el.id] 
214                  block2extifaces[b1] += 1 
215   
216              blocks = dict((i, []) for i in range(part_count)) 
217              for el_id, block in enumerate(partition): 
218                  blocks[block].append(el_id) 
219              block_elements = max(len(block_els) for block_els in blocks.itervalues()) 
220   
221              from hedge.cuda.plan import Parallelism 
222              actual_plan = flux_plan.copy( 
223                      max_ext_faces=max(block2extifaces.itervalues()), 
224                      max_faces=max( 
225                          len(blocks[b])*flux_plan.faces_per_el() 
226                          + block2extifaces[b] 
227                          for b in range(len(blocks))), 
228                      ) 
229              assert actual_plan.max_faces % 2 == 0 
230   
231              if (block_elements <= actual_plan.elements_per_block() 
232                      and (flux_plan.occupancy_record().occupancy - 
233                          actual_plan.occupancy_record().occupancy) < 1e-10): 
234                  break 
235   
236              part_count += min(5, int(part_count*0.01)) 
237   
238          print "blocks: theoretical:%d practical:%d" % (orig_part_count, part_count) 
239   
240          if False: 
241              from matplotlib.pylab import hist, show 
242              print plan.get_extface_count() 
243              hist(block2extifaces.values()) 
244              show() 
245              hist([len(block_els) for block_els in blocks.itervalues()]) 
246              show() 
247               
248          return actual_plan, partition 
249   
250   
251   
252   
253 -    def __init__(self, mesh, local_discretization=None,  
254              order=None, flux_plan=None, init_cuda=True, debug=False,  
255              dev=None, default_scalar_type=numpy.float32): 
256          ldis = self.get_local_discretization(mesh, local_discretization, order) 
257   
258          if init_cuda: 
259              cuda.init() 
260   
261          if dev is None: 
262              assert cuda.Device.count() >= 1 
263              dev = cuda.Device(0) 
264          if isinstance(dev, int): 
265              dev = cuda.Device(dev) 
266          if init_cuda: 
267              self.cuda_context = dev.make_context() 
268   
269          self.device = dev 
270          from hedge.cuda.tools import DeviceData 
271          self.devdata = DeviceData(dev) 
272   
273          # make preliminary plan 
274          if flux_plan is None: 
275              flux_plan = self._make_plan(ldis, mesh, default_scalar_type) 
276          print "projected flux exec plan:", flux_plan 
277   
278          # partition mesh, obtain updated plan 
279          self.flux_plan, self.partition = self._partition_mesh(mesh, flux_plan) 
280          del flux_plan 
281          print "actual flux exec plan:", self.flux_plan 
282          print "actual local op exec plan:", self.flux_plan.diff_plan() 
283          print "actual flux local exec plan:", self.flux_plan.flux_lifting_plan() 
284   
285          # initialize superclass 
286          hedge.discretization.Discretization.__init__(self, mesh, ldis, debug=debug, 
287                  default_scalar_type=default_scalar_type) 
288   
289          # build our own data structures 
290          self.blocks = self._build_blocks() 
291          self.face_storage_map = self._build_face_storage_map() 
292   
293          # make a reference discretization 
294          from hedge.discr_precompiled import Discretization 
295          self.test_discr = Discretization(mesh, ldis) 
296   
297 -    def _build_blocks(self): 
298          block_el_numbers = {} 
299          for el_id, block in enumerate(self.partition): 
300              block_el_numbers.setdefault(block, []).append(el_id) 
301   
302          block_count = len(block_el_numbers) 
303   
304          def make_block(block_num): 
305              fplan = self.flux_plan 
306   
307              microblocks = [] 
308              current_microblock = [] 
309              el_offsets_list = [] 
310              el_number_map = {} 
311              elements = [self.mesh.elements[ben] for ben in block_el_numbers[block_num]] 
312              for block_el_nr, el in enumerate(elements): 
313                  el_offset = ( 
314                          len(microblocks)*fplan.microblock.aligned_floats 
315                          + len(current_microblock)*fplan.dofs_per_el()) 
316                  el_number_map[el] = block_el_nr 
317                  el_offsets_list.append(el_offset) 
318   
319                  current_microblock.append(el) 
320                  if len(current_microblock) == fplan.microblock.elements: 
321                      microblocks.append(current_microblock) 
322                      current_microblock = [] 
323   
324              if current_microblock: 
325                  microblocks.append(current_microblock) 
326   
327              assert len(microblocks) <= fplan.microblocks_per_block() 
328   
329              eg, = self.element_groups 
330              return GPUBlock(block_num,  
331                      local_discretization=eg.local_discretization, 
332                      cpu_slices=[self.find_el_range(el.id) for el in elements],  
333                      microblocks=microblocks, 
334                      el_offsets_list=el_offsets_list, 
335                      el_number_map=el_number_map) 
336   
337          return [make_block(block_num) for block_num in range(block_count)] 
338   
339       
340   
341   
342 -    def _build_face_storage_map(self): 
343          # Side effects: 
344          # - fill in GPUBlock.extfaces 
345          # - set self.aligned_boundary_floats 
346          fsm = {} 
347   
348          from hedge.tools import IndexListRegistry 
349          fil_registry = IndexListRegistry(self.debug) 
350   
351          def make_int_face(face_pair_side): 
352              el = self.mesh.elements[face_pair_side.element_id] 
353              elface = (el, face_pair_side.face_id) 
354   
355              block = self.blocks[self.partition[el.id]] 
356              iln = fil_registry.register( 
357                      (ldis, face_pair_side.face_id), 
358                      lambda: ldis.face_indices()[face_pair_side.face_id] 
359                      ) 
360              result = GPUInteriorFaceStorage( 
361                  elface,  
362                  cpu_slice=self.find_el_range(el.id),  
363                  native_index_list_id=iln, 
364                  native_block=block,  
365                  face_pair_side=face_pair_side 
366                  ) 
367   
368              assert elface not in fsm 
369              fsm[elface] = result 
370              return result 
371   
372   
373          def narrow_ilist(in_el_ilist, native_el_ilist): 
374              return get_read_from_map_from_permutation 
375   
376              el_dof_to_face_dof = dict( 
377                      (el_dof, i) 
378                      for i, el_dof in enumerate(native_el_ilist)) 
379              return tuple(el_dof_to_face_dof[el_dof] 
380                      for el_dof in in_el_ilist) 
381   
382          int_fg, = self.face_groups 
383          ldis = int_fg.ldis_loc 
384          assert ldis == int_fg.ldis_opp 
385   
386          id_face_index_list_number = fil_registry.register( 
387                  None,  
388                  lambda: tuple(xrange(ldis.face_node_count())) 
389                  ) 
390          assert id_face_index_list_number == 0 
391   
392          from pytools import single_valued 
393          for fp in int_fg.face_pairs: 
394              face1 = make_int_face(fp.loc) 
395              face2 = make_int_face(fp.opp) 
396              face1.opposite = face2 
397              face2.opposite = face1 
398   
399              def apply_write_map(wmap, sequence): 
400                  result = [None] * len(sequence) 
401                  for wm_i, seq_i in zip(wmap, sequence): 
402                      result[wm_i] = seq_i 
403                  assert None not in result 
404                  return tuple(result) 
405   
406              f_ind = ldis.face_indices() 
407   
408              def face1_in_el_ilist(): 
409                  return tuple(int_fg.index_lists[ 
410                      fp.loc.face_index_list_number]) 
411   
412              def face2_in_el_ilist():  
413                  return tuple(int_fg.index_lists[ 
414                      fp.opp.face_index_list_number]) 
415   
416              def opp_write_map():  
417                  return tuple( 
418                          int_fg.index_lists[fp.opp_native_write_map]) 
419                   
420              face1.global_int_flux_index_list_id = fil_registry.register( 
421                      (int_fg, fp.loc.face_index_list_number), 
422                      face1_in_el_ilist) 
423              face1.global_ext_flux_index_list_id = fil_registry.register( 
424                      (int_fg, fp.opp.face_index_list_number), 
425                      face2_in_el_ilist) 
426   
427              face2.global_int_flux_index_list_id = fil_registry.register( 
428                      (int_fg, fp.opp_native_write_map, 
429                          fp.opp.face_index_list_number), 
430                      lambda: apply_write_map( 
431                          opp_write_map(), face2_in_el_ilist()) 
432                      ) 
433              face2.global_ext_flux_index_list_id = fil_registry.register( 
434                      (int_fg, fp.opp_native_write_map, 
435                          fp.loc.face_index_list_number), 
436                      lambda: apply_write_map( 
437                          opp_write_map(), face1_in_el_ilist()) 
438                      ) 
439   
440              from pytools import get_write_to_map_from_permutation as gwtm 
441              #assert gwtm(face2_in_el_ilist, f_ind[fp.opp.face_id]) == opp_write_map 
442              face1.opp_write_index_list_id = fil_registry.register( 
443                      (int_fg, "wtm", fp.opp.face_index_list_number, 
444                          fp.opp.face_id), 
445                      lambda: gwtm(face2_in_el_ilist(), f_ind[fp.opp.face_id]) 
446                      ) 
447              face2.opp_write_index_list_id = fil_registry.register( 
448                      (int_fg, "wtm",  
449                          fp.opp_native_write_map, 
450                          fp.loc.face_index_list_number, 
451                          fp.loc.face_id), 
452                      lambda: gwtm( 
453                          apply_write_map(opp_write_map(), face1_in_el_ilist()), 
454                          f_ind[fp.loc.face_id]) 
455                      ) 
456   
457          self.aligned_boundary_floats = 0 
458          from hedge.mesh import TAG_ALL 
459          for bdry_fg in self.get_boundary(TAG_ALL).face_groups: 
460              assert ldis == bdry_fg.ldis_loc 
461              aligned_fnc = self.devdata.align_dtype(ldis.face_node_count(),  
462                      self.flux_plan.float_size) 
463              for fp in bdry_fg.face_pairs: 
464                  assert fp.opp.element_id == hedge._internal.INVALID_ELEMENT 
465                  #assert (tuple(bdry_fg.index_lists[fp.opp.face_index_list_number])  
466                          #== id_face_index_list) 
467   
468                  face1 = make_int_face(fp.loc) 
469                  face2 = GPUBoundaryFaceStorage( 
470                          fp.opp.el_base_index, 
471                          self.aligned_boundary_floats, 
472                          fp.opp 
473                          ) 
474                  self.aligned_boundary_floats += aligned_fnc 
475                  face1.opposite = face2 
476                  face2.opposite = face1 
477   
478                  face1.global_int_flux_index_list_id = fil_registry.register( 
479                          (bdry_fg,fp.loc.face_index_list_number), 
480                          lambda: tuple(bdry_fg.index_lists[ 
481                              fp.loc.face_index_list_number]) 
482                          ) 
483                  face1.global_ext_flux_index_list_id = fil_registry.register( 
484                          (bdry_fg, fp.opp.face_index_list_number), 
485                          lambda: tuple(bdry_fg.index_lists[ 
486                              fp.opp.face_index_list_number]) 
487                          ) 
488   
489          self.index_lists = fil_registry.index_lists 
490          return fsm 
491   
492   
493   
494   
495 -    def find_el_gpu_index(self, el): 
496          fplan = self.flux_plan 
497          block = self.blocks[self.partition[el.id]] 
498   
499          mb_nr, in_mb_nr = divmod(block.el_number_map[el], fplan.microblock.elements) 
500   
501          return (block.number * self.flux_plan.dofs_per_block()  
502                  + mb_nr*fplan.microblock.aligned_floats 
503                  + in_mb_nr*fplan.dofs_per_el()) 
504   
505 -    def find_number_in_block(self, el): 
506          block = self.blocks[self.partition[el.id]] 
507          return block.el_number_map[el] 
508   
509 -    def gpu_dof_count(self): 
510          from hedge.cuda.tools import int_ceiling 
511   
512          fplan = self.flux_plan 
513          return int_ceiling( 
514                  int_ceiling( 
515                      fplan.dofs_per_block() * len(self.blocks),      
516                      fplan.diff_plan().dofs_per_macroblock()), 
517                  fplan.flux_lifting_plan().dofs_per_macroblock()) 
518   
519 -    def volume_to_gpu(self, field): 
520          from hedge.tools import log_shape 
521          ls = log_shape(field) 
522          if ls != (): 
523              result = numpy.array(ls, dtype=object) 
524   
525              from pytools import indices_in_shape 
526   
527              for i in indices_in_shape(ls): 
528                  result[i] = self.volume_to_gpu(field[i]) 
529              return result 
530          else: 
531              copy_vec = numpy.empty((self.gpu_dof_count(),), dtype=numpy.float32) 
532   
533              block_offset = 0 
534              block_size = self.flux_plan.dofs_per_block() 
535              for block in self.blocks: 
536                  face_length = block.local_discretization.face_node_count() 
537                  el_length = block.local_discretization.node_count() 
538   
539                  for el_offset, cpu_slice in zip( 
540                          block.el_offsets_list, block.cpu_slices): 
541                      copy_vec[block_offset+el_offset:block_offset+el_offset+el_length] = \ 
542                              field[cpu_slice] 
543   
544                  block_offset += block_size 
545   
546              return gpuarray.to_gpu(copy_vec) 
547   
548 -    def volume_from_gpu(self, field): 
549          from hedge.tools import log_shape 
550          ls = log_shape(field) 
551          if ls != (): 
552              result = numpy.zeros(ls, dtype=object) 
553   
554              from pytools import indices_in_shape 
555   
556              for i in indices_in_shape(ls): 
557                  result[i] = self.volume_from_gpu(field[i]) 
558              return result 
559          else: 
560              copied_vec = field.get(pagelocked=True) 
561              result = numpy.empty(shape=(len(self),), dtype=copied_vec.dtype) 
562   
563              block_offset = 0 
564              block_size = self.flux_plan.dofs_per_block() 
565              for block in self.blocks: 
566                  el_length = block.local_discretization.node_count() 
567   
568                  for el_offset, cpu_slice in zip( 
569                          block.el_offsets_list, block.cpu_slices): 
570                      result[cpu_slice] = \ 
571                              copied_vec[block_offset+el_offset 
572                                      :block_offset+el_offset+el_length] 
573   
574                  block_offset += block_size 
575   
576              return result 
577   
578      @memoize_method 
579 -    def gpu_boundary_embedding(self, tag): 
580          """Return an array of indices embedding a CPU boundary 
581          field for C{tag} into the GPU boundary field.""" 
582   
583          bdry = self.get_boundary(tag) 
584          result = numpy.empty( 
585                  (len(bdry.nodes),), 
586                  dtype=numpy.intp) 
587          result.fill(-1) 
588   
589          cpu_base = 0 
590          for elface in self.mesh.tag_to_boundary.get(tag, []): 
591              face_stor = self.face_storage_map[elface] 
592              bdry_stor = face_stor.opposite 
593              assert isinstance(bdry_stor, GPUBoundaryFaceStorage) 
594   
595              face_len = (bdry_stor.opposite.native_block 
596                      .local_discretization.face_node_count()) 
597              gpu_base = bdry_stor.gpu_bdry_index_in_floats 
598              result[cpu_base:cpu_base+face_len] = \ 
599                      numpy.arange(gpu_base, gpu_base+face_len) 
600              cpu_base += face_len 
601   
602          assert (result>=0).all() 
603          return result 
604   
605 -    def boundary_to_gpu(self, tag, field): 
606          from hedge.tools import log_shape 
607          ls = log_shape(field) 
608          if ls != (): 
609              from pytools import indices_in_shape 
610              result = numpy.zeros(ls, dtype=object) 
611              for i in indices_in_shape(ls): 
612                  result[i] = self.boundary_to_gpu(tag, field[i]) 
613              return result 
614          else: 
615              result = cuda.pagelocked_empty( 
616                      (self.aligned_boundary_floats,), 
617                      dtype=field.dtype) 
618   
619              # The boundary cannot be completely uninitialized, 
620              # because it might contain NaNs. If a certain part of the 
621              # boundary is to be ignored, it is simply multiplied by 
622              # zero in the kernel, which won't make the NaNs disappear. 
623   
624              # Therefore, as a second best solution, fill the boundary 
625              # with a bogus value so that we can tell if it actually 
626              # enters the computation. 
627   
628              result.fill(17)  
629              result[self.gpu_boundary_embedding(tag)] = field 
630              return gpuarray.to_gpu(result) 
631   
632      # vector construction ----------------------------------------------------- 
633 -    def volume_empty(self, shape=(), dtype=None): 
634          if dtype is None: 
635              dtype = self.flux_plan.float_type 
636   
637          return gpuarray.empty(shape+(self.gpu_dof_count(),), dtype=dtype) 
638   
639 -    def volume_zeros(self, shape=()): 
640          result = self.volume_empty(shape) 
641          result.fill(0) 
642          return result 
643   
644 -    def interpolate_volume_function(self, f, dtype=None): 
645          s = hedge.discretization.Discretization 
646   
647          def tgt_factory(shape, dtype): 
648              return s.volume_empty(self, shape, dtype) 
649   
650          return self.volume_to_gpu( 
651                  s.interpolate_volume_function(self, f, tgt_factory)) 
652   
653 -    def _new_bdry(self, tag, shape, create_func, dtype): 
654          if dtype is None: 
655              dtype = self.default_scalar_type 
656   
657          if shape == (): 
658              return create_func((self.aligned_boundary_floats,), dtype=dtype) 
659   
660          result = numpy.empty(shape, dtype=object) 
661          from pytools import indices_in_shape 
662          bdry = self.get_boundary(TAG_ALL) 
663          for i in indices_in_shape(shape): 
664              result[i] = create_func((self.aligned_boundary_floats,), dtype=dtype) 
665          return result 
666       
667 -    def boundary_empty(self, tag=hedge.mesh.TAG_ALL, shape=(), dtype=None): 
668          return self._new_bdry(tag, shape, gpuarray.empty, dtype) 
669   
670 -    def boundary_zeros(self, tag=hedge.mesh.TAG_ALL, shape=(), dtype=None): 
671          return self._new_bdry(tag, shape, gpuarray.zeros, dtype) 
672   
673 -    def interpolate_boundary_function(self, f, tag=hedge.mesh.TAG_ALL): 
674          s = hedge.discretization.Discretization 
675          def tgt_factory(tag, shape, dtype): 
676              return s.boundary_zeros(self, tag, shape, dtype) 
677   
678          return self.boundary_to_gpu(tag, 
679                  s.interpolate_boundary_function(self, f, tag, tgt_factory)) 
680   
681 -    def boundary_normals(self, tag=hedge.mesh.TAG_ALL): 
682          s = hedge.discretization.Discretization 
683          def tgt_factory(tag, shape, dtype): 
684              return s.boundary_zeros(self, tag, shape, dtype) 
685   
686          return self.boundary_to_gpu(tag, 
687                  s.boundary_normals(self, tag, tgt_factory)) 
688   
689 -    def volumize_boundary_field(self, bfield, tag=hedge.mesh.TAG_ALL): 
690          raise NotImplementedError 
691   
692      @memoize_method 
693 -    def _boundarize_info(self, tag): 
694          from_indices = [] 
695          to_indices = [] 
696   
697          for elface in self.mesh.tag_to_boundary.get(tag, []): 
698              vol_face = self.face_storage_map[elface] 
699              bdry_face = vol_face.opposite 
700              assert isinstance(bdry_face, GPUBoundaryFaceStorage) 
701   
702              vol_el_index = \ 
703                      self.find_el_gpu_index(vol_face.el_face[0]) 
704              native_ilist = self.index_lists[vol_face.native_index_list_id] 
705              from_indices.extend(vol_el_index+i for i in native_ilist) 
706              bdry_index = bdry_face.gpu_bdry_index_in_floats 
707              to_indices.extend( 
708                      xrange(bdry_index, bdry_index+len(native_ilist))) 
709   
710          return ( 
711                  gpuarray.to_gpu( 
712                      numpy.array(from_indices, dtype=numpy.uint32)), 
713                  gpuarray.to_gpu( 
714                      numpy.array(to_indices, dtype=numpy.uint32)), 
715                  len(from_indices) 
716                  ) 
717   
718           
719 -    def boundarize_volume_field(self, field, tag=hedge.mesh.TAG_ALL): 
720          kernel, field_texref = _boundarize_kernel() 
721   
722          from_indices, to_indices, idx_count = self._boundarize_info(tag) 
723          block_count, threads_per_block, elems_per_block = \ 
724                  gpuarray.splay(idx_count) 
725   
726          def do_scalar(subfield): 
727              from hedge.mesh import TAG_ALL 
728              if tag != TAG_ALL: 
729                  result = self.boundary_zeros(tag) 
730              else: 
731                  result = self.boundary_empty(tag) 
732   
733              if idx_count: 
734                  subfield.bind_to_texref(field_texref) 
735                  kernel(result, to_indices, from_indices, 
736                          numpy.uint32(idx_count), 
737                          block=(threads_per_block,1,1), grid=(block_count,1), 
738                          texrefs=[field_texref]) 
739              return result 
740   
741          from hedge.tools import log_shape 
742          ls = log_shape(field) 
743   
744          if ls == (): 
745              return do_scalar(field) 
746          else: 
747              result = numpy.empty(ls, dtype=object) 
748              from pytools import indices_in_shape 
749              for i in indices_in_shape(ls): 
750                  result[i] = do_scalar(field[i]) 
751   
752              return result 
753   
754      # host vector construction ------------------------------------------------ 
755      s = hedge.discretization.Discretization 
756      host_volume_empty = s.volume_empty 
757      host_volume_zeros = s.volume_zeros  
758      del s 
759   
760      # optemplate processing --------------------------------------------------- 
761 -    def compile(self, optemplate): 
762          from hedge.cuda.execute import OpTemplateWithEnvironment 
763          return OpTemplateWithEnvironment(self, optemplate) 
764