1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
import numpy

from pytools import memoize_method

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

import hedge.backends.cuda.plan
from hedge.backends.cuda.kernelbase import DiffKernelBase


class ExecutionPlan(hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan):
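    # Shared memory footprint: one resident copy of the field for each of
    # the parallel*inline in-flight microblocks, plus a small fixed
    # overhead.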
    @memoize_method
    def shared_mem_use(self):
        given = self.given

        return (64  # parameters, block header, small extra stuff
                + given.float_size() * (
                    self.parallelism.parallel
                    * self.parallelism.inline
                    * self.given.microblock.aligned_floats))

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_field",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                None,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr):
        return Kernel(discr, self)


class Kernel(DiffKernelBase):
    def __init__(self, discr, plan):
        self.discr = discr
        self.plan = plan

        # One grid block per macroblock's worth of DOFs.
        from hedge.backends.cuda.tools import int_ceiling
        self.grid = (int_ceiling(
            self.plan.given.total_dofs()
            / self.plan.dofs_per_macroblock()), 1)

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        # Any first-derivative operator serves for timing; get_kernel
        # re-derives elgroup from discr, so None is acceptable here.
        from hedge.optemplate import DifferentiationOperator as op_class
        try:
            func = self.get_kernel(op_class, elgroup=None,
                    for_benchmark=True)
        except cuda.CompileError:
            return None

        def vol_empty():
            # A volume-sized vector, padded to whole macroblocks.
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        field = vol_empty()
        field.fill(0)

        xyz_diff = [vol_empty() for axis in range(discr.dimensions)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                func.prepared_call(self.grid,
                        0,
                        field.gpudata,
                        *xyz_diff_gpudata)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        # time_since() reports milliseconds; convert to seconds per launch.
        return (1e-3/count * stop.time_since(start),
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        func = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
        if use_debugbuf:
            debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        xyz_diff = [discr.volume_empty() for axis in range(d)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(
                    func.prepared_timed_call(self.grid,
                        debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata))
            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * discr.dimensions
                    * given.dofs_per_el()
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel

                    # field fetch
                    + given.microblock.aligned_floats
                    * self.plan.parallelism.total()
                    )

            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats

                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_diff.add(gmem_bytes)
        else:
            func.prepared_call(self.grid,
                    debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata)

        if use_debugbuf:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print field.shape
            print copied_debugbuf
            raw_input()

        return xyz_diff

    @memoize_method
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(
            Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(float_type, "field"))]
            + [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims]))

        par = self.plan.parallelism

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value("texture<%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                "rst_to_xyz_tex"),
            ])
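        # Textures cannot hold doubles natively; the fp_tex_* types from
        # pycuda-helpers.hpp store each double as an int2 and reassemble
        # it on fetch (the "double hack" used throughout below).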
        if float_type == numpy.float64:
            cmod.append(Value(
                "texture<fp_tex_double, 1, cudaReadModeElementType>",
                "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value(
                "texture<float%d, 1, cudaReadModeElementType>"
                % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

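        # Thread layout: threadIdx.x indexes DOFs within a chunk of
        # smem-granularity size, threadIdx.y selects the parallel
        # microblock, and threadIdx.z the chunk within a microblock.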
        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ELS_PER_MB", given.microblock.elements),
            Define("DOFS_PER_MB", "(DOFS_PER_EL*ELS_PER_MB)"),
            Line(),
            Define("CHUNK_SIZE", given.devdata.smem_granularity),
            Define("CHUNK_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Define("CHUNK_NR", "threadIdx.z"),
            Define("MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
            Define("EL_DOF", "(MB_DOF - mb_el*DOFS_PER_EL)"),
            Line(),
            Define("MACROBLOCK_NR", "blockIdx.x"),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Line(),
            # One copy of the field per parallel*inline microblock.
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_field"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "ALIGNED_DOFS_PER_MB")),
            Line(),
            ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "MB_DOF / DOFS_PER_EL"),
            Line(),
            ])

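        # Generate the unrolled local matrix-vector product: each thread
        # accumulates reference-space derivatives d{inl}rst{axis} for its
        # DOF, then store_code applies the rst->xyz chain rule.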
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

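            # The store step: combine reference-space derivatives with the
            # per-element rst->xyz Jacobian factors fetched from texture,
            # one store per inline microblock and global axis.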
            store_code = Block()
            for inl in range(par.inline):
                for glob_axis in dims:
                    store_code.append(Assign(
                        "dxyz%d[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                        % (glob_axis, inl),
                        " + ".join(
                            "fp_tex1Dfetch(rst_to_xyz_tex, %(loc_axis)d + "
                            "DIMENSIONS*(%(glob_axis)d + DIMENSIONS*("
                            "(GLOBAL_MB_NR+%(inl)d)*ELS_PER_MB + mb_el)))"
                            "* d%(inl)drst%(loc_axis)d" % {
                                "loc_axis": loc_axis,
                                "glob_axis": glob_axis,
                                "inl": inl
                                }
                            for loc_axis in dims
                            )
                        ))

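            # Double-buffer discipline: one barrier so nobody overwrites
            # smem_field while it is still being read, a second so all
            # loads have arrived before anyone consumes them.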
            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Assign("smem_field[PAR_MB_NR][%d][MB_DOF]" % inl,
                    "field[(GLOBAL_MB_NR+%d)*ALIGNED_DOFS_PER_MB + MB_DOF]" % inl)
                for inl in range(par.inline)
                ]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

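            # j indexes the column of the differentiation matrix (the
            # source DOF within the element). For float32, one float4
            # fetch yields the matrix entries for all local axes at once;
            # for float64, each axis needs its own (hacked) double fetch.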
            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, EL_DOF + %s*DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, "
                            "%(axis)d + DIMENSIONS*(EL_DOF + %(j)s*DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                                "inl": inl,
                                "axis": axis,
                                "j": j
                                })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("MB_DOF < DOFS_PER_MB", Block(unroll(unroll_body,
                total_number=given.dofs_per_el(),
                max_unroll=self.plan.max_unroll)
                +[store_code])))

            return code

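        # Outer loop: each thread block works through SEQ_MB_COUNT
        # microblocks sequentially.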
        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("diff.cu", "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug)

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        if for_benchmark:
            rst_to_xyz = self.fake_localop_rst_to_xyz()
        else:
            rst_to_xyz = self.localop_rst_to_xyz(diff_op_cls, elgroup)

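        # Bind the geometry factors and the differentiation matrices to
        # their textures; allow_double_hack stores doubles as int2 behind
        # the scenes.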
        rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex")
        rst_to_xyz.gpu_data.bind_to_texref_ext(rst_to_xyz_texref,
                allow_double_hack=True)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op_cls, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

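        # Every kernel argument is a pointer: debugbuf, field, and one
        # output array per dimension.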
        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                block=(
                    given.devdata.smem_granularity,
                    self.plan.parallelism.parallel,
                    given.microblock.aligned_floats//given.devdata.smem_granularity),
                texrefs=[rst_to_xyz_texref, diff_rst_mat_texref])
        return func

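    # The differentiation matrices, stacked along the first axis (one slab
    # per local axis, padded to a valid texture channel count for float32)
    # in Fortran order, so that a single float4 fetch at (EL_DOF, j)
    # delivers the matrix entries for all axes at once.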
    @memoize_method
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        given = self.plan.given
        d = discr.dimensions

        if given.float_type == numpy.float32:
            first_dim = given.devdata.make_valid_tex_channel_count(d)
        elif given.float_type == numpy.float64:
            first_dim = d
        else:
            assert False

        result = numpy.zeros((first_dim, given.dofs_per_el(), given.dofs_per_el()),
                dtype=given.float_type, order="F")
        for i, dm in enumerate(diff_op_cls.matrices(elgroup)):
            result[i] = dm

        return gpuarray.to_gpu(result)