1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24   
 25  import numpy 
 26  import hedge.backends.cuda.plan 
 27  from pytools import memoize_method 
 28  import pycuda.driver as cuda 
 29  import pycuda.gpuarray as gpuarray 
 30  import hedge.backends.cuda.plan 
 31  from pycuda.compiler import SourceModule 
 32  from hedge.backends.cuda.kernelbase import DiffKernelBase 
 33   
 34   
 35   
 36   
 37   
class ExecutionPlan(hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan):
    # Execution plan for the shared-memory ("smem_field") variant of the
    # element-local differentiation operator.
    #
    # NOTE(review): several source lines of this class appear to be missing
    # from this copy of the file, including the `def` headers of the first
    # two members below -- restore them from the upstream hedge sources
    # before use.

    @memoize_method
    # NOTE(review): `def` header missing here; judging from the body this is
    # the shared-memory-use estimator (presumably `def shared_mem_use(self):`
    # -- confirm upstream).
        given = self.given

        # 64 bytes of fixed overhead, plus one float per aligned microblock
        # DOF for each concurrently resident (parallel x inline) microblock.
        return (64
               + given.float_size() * (
                   self.parallelism.parallel
                   * self.parallelism.inline
                   * self.given.microblock.aligned_floats))

    @staticmethod
    # NOTE(review): `def` header missing here; this returns SQL column
    # declarations whose order matches the tuple built by features() below.
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        # Benchmark-record values: plan type, the three parallelism knobs,
        # (unused) segment size, unroll limit, microblock size, and the
        # caller-measured per-kernel resource usage.
        return ("smem_field",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                None,  # segment_size: not applicable to this plan type
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )
  80   
  83   
 84   
 85   
 86   
 87   
class Kernel(DiffKernelBase):
    # GPU kernel wrapper that applies the element-local differentiation
    # operator using the shared-memory-field strategy.
    #
    # NOTE(review): a sizable stretch of this class (its constructor and the
    # header/setup of the benchmark method whose tail follows) appears to be
    # missing from this copy of the file.

        # --- benchmark method body (its `def` header is missing above) ----
        # Times `count` launches of the generated kernel on a zeroed field
        # and reports timing plus per-kernel resource usage.
        field = vol_empty()
        field.fill(0)

        xyz_diff = [vol_empty() for axis in range(discr.dimensions)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if "cuda_fastbench" in discr.debug:
            count = 1  # quick-and-dirty benchmarking: a single launch
        else:
            count = 20

        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                # The debugbuf pointer slot receives a literal 0 (null)
                # while benchmarking.
                func.prepared_call(self.grid,
                        0,
                        field.gpudata,
                        *xyz_diff_gpudata)
            except cuda.LaunchError:
                # Plan not launchable (e.g. resource exhaustion): tell the
                # autotuner to discard this configuration.
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        # Event.time_since() is in milliseconds -> 1e-3 converts to seconds
        # per launch.
        return (1e-3/count * stop.time_since(start),
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)
 151   
        # --- operator-application method body (its `def` header, original
        # line 152, is missing from this copy; presumably
        # `def __call__(self, op_class, field):` -- confirm upstream) -------
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        func = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        # Allocate a real debug buffer only when both debug flags are set;
        # otherwise pass a cheap stand-in object.
        use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
        if use_debugbuf:
            debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        # One output array per spatial axis.
        xyz_diff = [discr.volume_empty() for axis in range(d)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(
                    func.prepared_timed_call(self.grid,
                        debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata))

            # Estimate of global-memory floats touched per block:
            block_gmem_floats = (
                    # matrix fetches
                    given.microblock.aligned_floats
                    * discr.dimensions
                    * given.dofs_per_el()
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel
                    # field fetches
                    + given.microblock.aligned_floats
                    * self.plan.parallelism.total()
                    )

            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats
                    # presumably the per-node output writes -- TODO confirm
                    + len(discr.nodes))

            discr.gmem_bytes_diff.add(gmem_bytes)
        else:
            func.prepared_call(self.grid,
                    debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata)

        if use_debugbuf:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print field.shape

            print copied_debugbuf
            raw_input()  # pause so the debug output can be inspected

        return xyz_diff
 209   
210      @memoize_method 
211 -    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False): 
 212          from codepy.cgen import \ 
213                  Pointer, POD, Value, ArrayOf, Const, \ 
214                  Module, FunctionDeclaration, FunctionBody, Block, \ 
215                  Comment, Line, Define, Include, \ 
216                  Initializer, If, For, Statement, Assign 
217   
218          from pycuda.tools import dtype_to_ctype 
219          from codepy.cgen.cuda import CudaShared, CudaGlobal 
220                   
221          discr = self.discr 
222          d = discr.dimensions 
223          dims = range(d) 
224          given = self.plan.given 
225   
226          elgroup, = discr.element_groups 
227          float_type = given.float_type 
228   
229          f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),  
230              [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ] 
231              + [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims] 
232              )) 
233   
234          par = self.plan.parallelism 
235           
236          cmod = Module([ 
237                  Include("pycuda-helpers.hpp"), 
238                  Line(), 
239                  Value("texture<%s, 1, cudaReadModeElementType>" 
240                      % dtype_to_ctype(float_type, with_fp_tex_hack=True),  
241                      "rst_to_xyz_tex"), 
242                  ]) 
243   
244          if float_type == numpy.float64: 
245              cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",  
246                      "diff_rst_mat_tex")) 
247          elif float_type == numpy.float32: 
248              rst_channels = given.devdata.make_valid_tex_channel_count(d) 
249              cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"  
250                      % rst_channels, "diff_rst_mat_tex")) 
251          else: 
252              raise ValueError("unsupported float type: %s" % float_type) 
253   
254          cmod.extend([ 
255                  Line(), 
256                  Define("DIMENSIONS", discr.dimensions), 
257                  Define("DOFS_PER_EL", given.dofs_per_el()), 
258                  Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), 
259                  Define("ELS_PER_MB", given.microblock.elements), 
260                  Define("DOFS_PER_MB", "(DOFS_PER_EL*ELS_PER_MB)"), 
261                  Line(), 
262                  Define("CHUNK_SIZE", given.devdata.smem_granularity), 
263                  Define("CHUNK_DOF", "threadIdx.x"), 
264                  Define("PAR_MB_NR", "threadIdx.y"), 
265                  Define("CHUNK_NR", "threadIdx.z"), 
266                  Define("MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), 
267                  Define("EL_DOF", "(MB_DOF - mb_el*DOFS_PER_EL)"), 
268                  Line(), 
269                  Define("MACROBLOCK_NR", "blockIdx.x"), 
270                  Line(), 
271                  Define("PAR_MB_COUNT", par.parallel), 
272                  Define("INLINE_MB_COUNT", par.inline), 
273                  Define("SEQ_MB_COUNT", par.serial), 
274                  Line(), 
275                  Define("GLOBAL_MB_NR_BASE",  
276                      "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), 
277                  Define("GLOBAL_MB_NR",  
278                      "(GLOBAL_MB_NR_BASE" 
279                      "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), 
280                  Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), 
281                  Line(), 
282                  CudaShared( 
283                      ArrayOf( 
284                          ArrayOf( 
285                              ArrayOf( 
286                                  POD(float_type, "smem_field"),  
287                                  "PAR_MB_COUNT"), 
288                              "INLINE_MB_COUNT"), 
289                          "ALIGNED_DOFS_PER_MB")), 
290                  Line(), 
291                  ]) 
292   
293          S = Statement 
294          f_body = Block([ 
295              Initializer(Const(POD(numpy.uint16, "mb_el")), 
296                  "MB_DOF / DOFS_PER_EL"), 
297              Line(), 
298              ]) 
299               
300           
301          def get_scalar_diff_code(): 
302              code = [] 
303              for inl in range(par.inline): 
304                  for axis in dims: 
305                      code.append( 
306                          Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) 
307   
308              code.append(Line()) 
309   
310              tex_channels = ["x", "y", "z", "w"] 
311   
312              store_code = Block() 
313              for inl in range(par.inline): 
314                  for glob_axis in dims: 
315                      store_code.append(Assign( 
316                          "dxyz%d[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"  
317                          % (glob_axis, inl), 
318                          " + ".join( 
319                              "fp_tex1Dfetch(rst_to_xyz_tex, %(loc_axis)d + " 
320                              "DIMENSIONS*(%(glob_axis)d + DIMENSIONS*(" 
321                              "(GLOBAL_MB_NR+%(inl)d)*ELS_PER_MB + mb_el)))"  
322                              "* d%(inl)drst%(loc_axis)d" % { 
323                                  "loc_axis": loc_axis,  
324                                  "glob_axis": glob_axis, 
325                                  "inl": inl 
326                              } 
327                              for loc_axis in dims 
328                              ) 
329                          )) 
330   
331              from hedge.backends.cuda.tools import unroll 
332              code.extend([ 
333                  Comment("everybody needs to be done with the old data"), 
334                  S("__syncthreads()"), 
335                  Line(), 
336                  ]+[ 
337                  Assign("smem_field[PAR_MB_NR][%d][MB_DOF]" % inl, 
338                      "field[(GLOBAL_MB_NR+%d)*ALIGNED_DOFS_PER_MB + MB_DOF]" % inl) 
339                  for inl in range(par.inline) 
340                  ]+[ 
341                  Line(), 
342                  Comment("all the new data must be loaded"), 
343                  S("__syncthreads()"), 
344                  Line(), 
345                  ]) 
346   
347              if float_type == numpy.float32: 
348                  code.append(Value("float%d" % rst_channels, "dmat_entries")) 
349   
350              code.extend([ 
351                  POD(float_type, "field_value%d" % inl) 
352                  for inl in range(par.inline) 
353                  ]+[Line()]) 
354   
355              def unroll_body(j): 
356                  result = [ 
357                      Assign("field_value%d" % inl,  
358                          "smem_field[PAR_MB_NR][%d][mb_el*DOFS_PER_EL+%s]" % (inl, j)) 
359                      for inl in range(par.inline) 
360                      ] 
361   
362                  if float_type == numpy.float32: 
363                      result.append(Assign("dmat_entries", 
364                          "tex1Dfetch(diff_rst_mat_tex, EL_DOF + %s*DOFS_PER_EL)" % j)) 
365                      result.extend( 
366                          S("d%drst%d += dmat_entries.%s * field_value%d"  
367                              % (inl, axis, tex_channels[axis], inl)) 
368                          for inl in range(par.inline) 
369                          for axis in dims) 
370                  elif float_type == numpy.float64: 
371                      result.extend( 
372                          S("d%(inl)drst%(axis)d += " 
373                              "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d + DIMENSIONS*(EL_DOF + %(j)d*DOFS_PER_EL))" 
374                              "* field_value%(inl)d" % { 
375                              "inl": inl, 
376                              "axis": axis, 
377                              "j": j 
378                              }) 
379                          for inl in range(par.inline) 
380                          for axis in dims) 
381                  else: 
382                      assert False 
383   
384                  return result 
 385   
386              code.append(If("MB_DOF < DOFS_PER_MB", Block(unroll(unroll_body, 
387                      total_number=given.dofs_per_el(),  
388                      max_unroll=self.plan.max_unroll) 
389                      +[store_code]))) 
390   
391              return code 
392   
393          f_body.extend([ 
394              For("unsigned short seq_mb_number = 0", 
395                  "seq_mb_number < SEQ_MB_COUNT", 
396                  "++seq_mb_number", 
397                  Block(get_scalar_diff_code()) 
398                  ) 
399              ]) 
400   
401           
402          cmod.append(FunctionBody(f_decl, f_body)) 
403   
404          if not for_benchmark and "cuda_dump_kernels" in discr.debug: 
405              open("diff.cu", "w").write(str(cmod)) 
406   
407          mod = SourceModule(cmod,  
408                  keep="cuda_keep_kernels" in discr.debug,  
409                   
410                  ) 
411   
412          func = mod.get_function("apply_diff_mat_smem") 
413   
414          if "cuda_diff" in discr.debug: 
415              print "diff: lmem=%d smem=%d regs=%d" % ( 
416                      func.local_size_bytes,  
417                      func.shared_size_bytes,  
418                      func.registers) 
419   
420          if for_benchmark: 
421              rst_to_xyz = self.fake_localop_rst_to_xyz() 
422          else: 
423              rst_to_xyz = self.localop_rst_to_xyz(diff_op_cls, elgroup) 
424   
425          rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex") 
426          rst_to_xyz.gpu_data.bind_to_texref_ext(rst_to_xyz_texref, 
427                  allow_double_hack=True) 
428   
429          diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex") 
430          gpu_diffmats = self.gpu_diffmats(diff_op_cls, elgroup) 
431           
432          if given.float_type == numpy.float32: 
433              gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels) 
434          elif given.float_type == numpy.float64: 
435              gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, 
436                      allow_double_hack=True) 
437          else: 
438              assert False 
439   
440          func.prepare( 
441                  ["PP"] + discr.dimensions*[float_type], 
442                  block=( 
443                      given.devdata.smem_granularity,  
444                      self.plan.parallelism.parallel,  
445                      given.microblock.aligned_floats//given.devdata.smem_granularity), 
446                  texrefs=[rst_to_xyz_texref, diff_rst_mat_texref]) 
447          return func 
448   
449       
    @memoize_method
    # NOTE(review): `def` header missing here; from the body and the call
    # site in get_kernel this is `def gpu_diffmats(self, diff_op_cls,
    # elgroup):` -- confirm against upstream.
        discr = self.discr
        given = self.plan.given
        d = discr.dimensions

        # For float32 the matrices feed a multi-channel texture fetch, so
        # pad the leading (axis) dimension up to a valid channel count.
        if given.float_type == numpy.float32:
            first_dim = given.devdata.make_valid_tex_channel_count(d)
        elif given.float_type == numpy.float64:
            first_dim = d
        else:
            assert False

        # Fortran (column-major) layout; one (dofs x dofs) differentiation
        # matrix per local axis, any padding slots left zero.
        result = numpy.zeros((first_dim, given.dofs_per_el(), given.dofs_per_el()),
                dtype=given.float_type, order="F")
        for i, dm in enumerate(diff_op_cls.matrices(elgroup)):
            result[i] = dm

        return gpuarray.to_gpu(result)
 469