1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  import numpy.linalg as la 
 26  from pytools import memoize_method 
 27  import hedge.optemplate 
 28  from hedge.compiler import OperatorCompilerBase, \ 
 29          Assign, FluxBatchAssign 
 30  import pycuda.driver as cuda 
 31  import pymbolic.mapper.stringifier 
 32  from hedge.backends.exec_common import ExecutionMapperBase 
 33   
 34   
 35   
 36   
 37   
 38 -def get_vec_structure(vec, point_size, segment_size, block_size, 
 39          other_char=lambda snippet: "."): 
  40      """Prints a structured view of a vector--one character per `point_size` floats, 
 41      `segment_size` characters partitioned off by spaces, `block_size` segments 
 42      per line. 
 43   
 44      The caracter printed is either an 'N' if any NaNs are encountered, a zero 
 45      if the entire snippet is zero, or otherwise whatever `other_char` returns, 
 46      defaulting to a period. 
 47      """ 
 48   
 49      result = "" 
 50      for block in range(len(vec) // block_size): 
 51          struc = "" 
 52          for segment in range(block_size//segment_size): 
 53              for point in range(segment_size//point_size): 
 54                  offset = block*block_size + segment*segment_size + point*point_size 
 55                  snippet = vec[offset:offset+point_size] 
 56   
 57                  if numpy.isnan(snippet).any(): 
 58                      struc += "N" 
 59                  elif (snippet == 0).any(): 
 60                      struc += "0" 
 61                  else: 
 62                      struc += other_char(snippet) 
 63   
 64              struc += " " 
 65          result += struc + "\n" 
 66      return result 
  67   
 68   
 69   
 70   
 71 -def print_error_structure(discr, computed, reference, diff, 
 72          eventful_only=False, detail=True): 
  73      norm_ref = la.norm(reference) 
 74      struc_lines = [] 
 75   
 76      if norm_ref == 0: 
 77          norm_ref = 1 
 78   
 79      from hedge.tools import relative_error 
 80      numpy.set_printoptions(precision=2, linewidth=130, suppress=True) 
 81      for block in discr.blocks: 
 82          add_lines = [] 
 83          struc_line  = "%7d " % (block.number * discr.flux_plan.dofs_per_block()) 
 84          i_el = 0 
 85          eventful = False 
 86          for mb in block.microblocks: 
 87              for el in mb: 
 88                  s = discr.find_el_range(el.id) 
 89                  relerr = relative_error(la.norm(diff[s]), norm_ref) 
 90                  if relerr > 1e-4: 
 91                      eventful = True 
 92                      struc_line += "*" 
 93                      if detail: 
 94                          print "block %d, el %d, global el #%d, rel.l2err=%g" % ( 
 95                                  block.number, i_el, el.id, relerr) 
 96                          print computed[s] 
 97                          print reference[s] 
 98                          print diff[s] 
 99                          print diff[s]/norm_ref 
100                          print la.norm(diff[s]), norm_ref 
101                          raw_input() 
102                  elif numpy.isnan(diff[s]).any(): 
103                      eventful = True 
104                      struc_line += "N" 
105                      add_lines.append(str(diff[s])) 
106                       
107                      if detail: 
108                          print "block %d, el %d, global el #%d, rel.l2err=%g" % ( 
109                                  block.number, i_el, el.id, relerr) 
110                          print computed[s] 
111                          print reference[s] 
112                          print diff[s] 
113                          raw_input() 
114                  else: 
115                      if numpy.max(numpy.abs(reference[s])) == 0: 
116                          struc_line += "0" 
117                      else: 
118                          if False: 
119                              print "block %d, el %d, global el #%d, rel.l2err=%g" % ( 
120                                      block.number, i_el, el.id, relerr) 
121                              print computed[s] 
122                              print reference[s] 
123                              print diff[s] 
124                              raw_input() 
125                          struc_line += "." 
126                  i_el += 1 
127              struc_line += " " 
128          if (not eventful_only) or eventful: 
129              struc_lines.append(struc_line) 
130              if detail: 
131                  struc_lines.extend(add_lines) 
132      print 
133      print "\n".join(struc_lines) 
 134   
141          return [(insn.name, self(insn.expr))], [] 
 142   
144          if self.executor.discr.instrumented: 
145              def stats_callback(n, vec_expr, t_func): 
146                  self.executor.discr.vector_math_timer.add_timer_callable(t_func) 
147                  self.executor.discr.vector_math_flop_counter.add( 
148                          n*insn.flop_count()) 
149                  self.executor.discr.gmem_bytes_vector_math.add( 
150                          self.executor.discr.given.float_size() * n * 
151                          (len(vec_expr.vector_deps)+len(insn.exprs))) 
 152          else: 
153              stats_callback = None 
154   
155          if insn.flop_count() == 0: 
156              return [(name, self(expr)) 
157                  for name, expr in zip(insn.names, insn.exprs)], [] 
158          else: 
159              compiled = insn.compiled(self.executor) 
160              return zip(compiled.result_names(), 
161                      compiled(self, stats_callback)), [] 
164          field = self.rec(insn.field) 
165   
166          discr = self.executor.discr 
167          if discr.instrumented: 
168              discr.diff_counter.add(discr.dimensions) 
169              discr.diff_flop_counter.add(discr.dimensions*( 
170                  self.executor.diff_rst_flops + self.executor.diff_rescale_one_flops)) 
171   
172          xyz_diff = self.executor.diff_kernel(insn.op_class, field) 
173   
174          if set(["cuda_diff", "cuda_compare"]) <= discr.debug: 
175              field = self.rec(insn.field) 
176              f = discr.volume_from_gpu(field) 
177              assert not numpy.isnan(f).any(), "Initial field contained NaNs." 
178              cpu_xyz_diff = [discr.volume_from_gpu(xd) for xd in xyz_diff] 
179              dx = cpu_xyz_diff[0] 
180   
181              test_discr = discr.test_discr 
182              real_dx = test_discr.nabla[0].apply(f.astype(numpy.float64)) 
183               
184              diff = dx - real_dx 
185   
186              for i, xd in enumerate(cpu_xyz_diff): 
187                  if numpy.isnan(xd).any(): 
188                      self.print_error_structure(xd, xd, xd-xd, 
189                              eventful_only=False, detail=False) 
190                      assert False, "Resulting field %d contained NaNs." % i 
191               
192              from hedge.tools import relative_error 
193              rel_err_norm = relative_error(la.norm(diff), la.norm(real_dx)) 
194              print "diff", rel_err_norm 
195              if not (rel_err_norm < 5e-5): 
196                  self.print_error_structure(dx, real_dx, diff, 
197                          eventful_only=False, detail=False) 
198   
199              assert rel_err_norm < 5e-5 
200   
201          return [(name, xyz_diff[op.xyz_axis]) 
202                  for name, op in zip(insn.names, insn.operators)], [] 
 203           
204   
206          discr = self.executor.discr 
207   
208          kernel = insn.kernel(self.executor) 
209          all_fofs = kernel(self.rec, discr.fluxlocal_plan) 
210          elgroup, = discr.element_groups 
211   
212          result = [ 
213              (name, self.executor.fluxlocal_kernel( 
214                  fluxes_on_faces,  
215                  *self.executor.flux_local_data( 
216                      self.executor.fluxlocal_kernel, elgroup, wdflux.is_lift))) 
217              for name, wdflux, fluxes_on_faces in zip( 
218                  insn.names, insn.fluxes, all_fofs)] 
219   
220          if discr.instrumented: 
221              given = discr.given 
222   
223              flux_count = len(insn.fluxes) 
224              dep_count = len(kernel.interior_deps) 
225   
226              discr.gather_counter.add( 
227                      flux_count*dep_count) 
228              discr.gather_flop_counter.add( 
229                      flux_count 
230                      * given.dofs_per_face() 
231                      * given.faces_per_el() 
232                      * len(discr.mesh.elements) 
233                      * (1  
234                          + 2 *  
235                          3*dep_count  
236                          ) 
237                      ) 
238   
239              discr.lift_counter.add(flux_count) 
240              discr.lift_flop_counter.add(flux_count*self.executor.lift_flops) 
241   
242           
243          if discr.debug & set(["cuda_lift", "cuda_flux"]): 
244              fplan = discr.flux_plan 
245   
246              for fluxes_on_faces in all_fofs: 
247                  useful_size = (len(discr.blocks) 
248                          * given.aligned_face_dofs_per_microblock() 
249                          * fplan.microblocks_per_block()) 
250                  fof = fluxes_on_faces.get() 
251   
252                  fof = fof[:useful_size] 
253   
254                  have_used_nans = False 
255                  for i_b, block in enumerate(discr.blocks): 
256                      offset = i_b*(given.aligned_face_dofs_per_microblock() 
257                              *fplan.microblocks_per_block()) 
258                      size = (len(block.el_number_map) 
259                              *given.dofs_per_face() 
260                              *given.faces_per_el()) 
261                      if numpy.isnan(la.norm(fof[offset:offset+size])).any(): 
262                          have_used_nans = True 
263   
264                  if have_used_nans: 
265                      struc = ( given.dofs_per_face(), 
266                              given.dofs_per_face()*given.faces_per_el(), 
267                              given.aligned_face_dofs_per_microblock(), 
268                              ) 
269   
270                      print self.get_vec_structure(fof, *struc) 
271                      raise RuntimeError("Detected used NaNs in flux gather output.") 
272   
273                  assert not have_used_nans 
274   
275          if "cuda_lift" in discr.debug: 
276              cuda.Context.synchronize() 
277              print "NANCHECK" 
278               
279              for name in insn.names: 
280                  flux = self.context[name] 
281                  copied_flux = discr.convert_volume(flux, kind="numpy") 
282                  contains_nans = numpy.isnan(copied_flux).any() 
283                  if contains_nans: 
284                      print "examining", name 
285                      print_error_structure(discr, 
286                              copied_flux, copied_flux, copied_flux-copied_flux, 
287                              eventful_only=True) 
288                  assert not contains_nans, "Resulting flux contains NaNs." 
289   
290          return result, [] 
 291   
298   
304      __slots__ = ["compiled"] 
305   
308   
309      comment = "compiled" 
310   
311      @memoize_method 
 327   
329      @memoize_method 
331          deps = set() 
332          for wdflux in self.fluxes: 
333              deps |= set(wdflux.interior_deps) 
334              deps |= set(wdflux.boundary_deps) 
335   
336          dep_mapper = self.dep_mapper_factory() 
337   
338          from pytools import flatten 
339          return set(flatten(dep_mapper(dep) for dep in deps)) 
 340   
341      @memoize_method 
343          return executor.discr.flux_plan.make_kernel( 
344                  executor.discr, executor, self.fluxes) 
  345   
387   
392      exec_mapper_class = ExecutionMapper 
393   
394 -    def __init__(self, discr, optemplate, post_bind_mapper): 
 395          self.discr = discr 
396   
397          from hedge.tools import diff_rst_flops, diff_rescale_one_flops, \ 
398                  mass_flops, lift_flops 
399          self.diff_rst_flops = diff_rst_flops(discr) 
400          self.diff_rescale_one_flops = diff_rescale_one_flops(discr) 
401          self.mass_flops = mass_flops(discr) 
402          self.lift_flops = sum(lift_flops(fg) for fg in discr.face_groups) 
403   
404          optemplate_stage1 = self.prepare_optemplate_stage1( 
405                  optemplate, post_bind_mapper) 
406   
407           
408          from hedge.optemplate import BoundaryTagCollector 
409          self.boundary_tag_to_number = {} 
410          for btag in BoundaryTagCollector()(optemplate_stage1): 
411              self.boundary_tag_to_number.setdefault(btag,  
412                      len(self.boundary_tag_to_number)) 
413   
414          e2bb = self.elface_to_bdry_bitmap = {} 
415           
416          for btag, bdry_number in self.boundary_tag_to_number.iteritems(): 
417              bdry_bit = 1 << bdry_number 
418              for elface in discr.mesh.tag_to_boundary.get(btag, []): 
419                  e2bb[elface] = (e2bb.get(elface, 0) | bdry_bit) 
420   
421           
422          from struct import calcsize 
423          self.code = OperatorCompiler( 
424                  max_vectors_in_batch_expr=220 // calcsize("P") 
425                  )( 
426                  self.prepare_optemplate_stage2(discr.mesh, optemplate_stage1)) 
427   
428           
429          self.diff_kernel = self.discr.diff_plan.make_kernel(discr) 
430          self.fluxlocal_kernel = self.discr.fluxlocal_plan.make_kernel(discr, 
431                  with_index_check=False) 
 432   
433      @staticmethod 
444   
445      @staticmethod 
449   
450      @classmethod 
454   
455      @classmethod 
464   
467   
468       
470          return self.code.execute( 
471                  self.discr.exec_mapper_class(vars, self)) 
 472   
473       
474      @memoize_method 
486   
487      @memoize_method 
488 -    def mass_data(self, kernel, elgroup, op_class): 
  491