1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
25 import numpy
26 from pytools import memoize_method
27 import pycuda.driver as cuda
28 from pycuda.compiler import SourceModule
29 import hedge.backends.cuda.plan
30 from hedge.backends.cuda.kernelbase import DiffKernelBase
31
32
33
34
35
class ExecutionPlan(hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan):
    def registers(self):
        return 16 + 4 * (self.parallelism.inline-1)

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_matrix",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                self.segment_size,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )
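    # feature_columns() reads like a list of "name type" database column
    # declarations, and features() supplies one matching row of values;
    # presumably the plan-selection machinery in hedge.backends.cuda.plan
    # records these when benchmarking candidate execution plans.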



class Kernel(DiffKernelBase):
    def benchmark(self):
        field = vol_empty()
        field.fill(0)

        field.bind_to_texref_ext(field_texref, allow_double_hack=True)
        xyz_diff = [vol_empty() for axis in range(discr.dimensions)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if "cuda_fastbench" in self.discr.debug:
            count = 1
        else:
            count = 20

        gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                func.prepared_call(self.grid, gpu_diffmats.device_memory,
                        *xyz_diff_gpudata)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()
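        # Event.time_since() reports elapsed time in milliseconds; the return
        # below averages over the `count` launches and converts to seconds
        # per launch.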
        return (1e-3/count * stop.time_since(start),
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        func, field_texref = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        field.bind_to_texref_ext(field_texref, allow_double_hack=True)

        xyz_diff = [discr.volume_empty() for axis in range(d)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(func.prepared_timed_call(
                    self.grid, gpu_diffmats.device_memory,
                    *xyz_diff_gpudata))

            from pytools import product
            discr.gmem_bytes_diff.add(
                    given.float_size()
                    * (
                        # matrix fetch
                        gpu_diffmats.block_floats * product(self.grid)
                        # field fetch
                        + given.dofs_per_el()
                        * given.dofs_per_el() * given.microblock.elements
                        * self.grid[1] * self.plan.parallelism.total()
                        # result store
                        + len(discr.nodes)
                        ))
        else:
            func.prepared_call(self.grid, gpu_diffmats.device_memory,
                    *xyz_diff_gpudata)

        if False:
            # dead debugging code: debugbuf is not allocated in this kernel
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print copied_debugbuf[:100].reshape((10, 10))
            raw_input()

        return xyz_diff
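    # Usage sketch (hypothetical caller; names are illustrative): a single
    # call returns one derivative per coordinate axis, e.g.
    #
    #   dx_u, dy_u, dz_u = diff_kernel(op_class, u_gpu)
    #
    # where u_gpu is a GPU field vector in this discretization's microblocked
    # layout and op_class identifies the differentiation operator.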

    @memoize_method
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        # kernel signature: one pointer to the packed diff matrix segments,
        # plus one output pointer per coordinate axis
        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                ] + [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims]
            ))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "rst_to_xyz_tex"),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ELS_PER_MB", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Line(),
                Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)"
                    % given.float_size()),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
                Line(),
                ])
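        # Indexing illustration (made-up example values): with
        # DOFS_PER_SEGMENT=32, PAR_MB_COUNT=2, INLINE_MB_COUNT=2 and
        # SEQ_MB_COUNT=4, each (MB_SEGMENT, MACROBLOCK_NR) block runs 32*2
        # threads and covers 2*2*4 = 16 microblocks; a thread with a given
        # PAR_MB_NR handles, for every seq_mb_number, the INLINE_MB_COUNT
        # consecutive microblocks starting at GLOBAL_MB_NR, touching only the
        # DOFS_PER_SEGMENT dofs selected by MB_SEGMENT in each of them.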

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment")
            +[S("__syncthreads()"), Line()])

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row": row, "col": col, "axis": axis}
                        )

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            # matrix-vector product: for each inline microblock, accumulate
            # this thread's (r,s,t) derivatives, fetching the field through
            # the texture cache
            code.extend(
                [POD(float_type, "field_value%d" % inl)
                    for inl in range(par.inline)]
                +[Line()]
                +unroll(lambda j: [
                    Assign("field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                        )
                    for inl in range(par.inline)]
                    +[Line()]
                    +[S("d%drst%d += %s * field_value%d"
                        % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                        for axis in dims
                        for inl in range(par.inline)]
                    +[Line()],
                    given.dofs_per_el(), self.plan.max_unroll)
                )

            store_code = Block()
            for inl in range(par.inline):
                for glob_axis in dims:
                    # chain rule: combine the local (r,s,t) derivatives with
                    # the per-element rst-to-xyz geometric factors and store
                    # the derivative along the global axis
                    store_code.append(Assign(
                        "dxyz%d[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                        % (glob_axis, inl),
                        " + ".join(
                            "fp_tex1Dfetch(rst_to_xyz_tex, %(loc_axis)d + "
                            "DIMENSIONS*(%(glob_axis)d + DIMENSIONS*("
                            "(GLOBAL_MB_NR+%(inl)d)*ELS_PER_MB + mb_el)))"
                            "* d%(inl)drst%(loc_axis)d" % {
                                "loc_axis": loc_axis,
                                "glob_axis": glob_axis,
                                "inl": inl
                                }
                            for loc_axis in dims
                            )
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code
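        # get_scalar_diff_code() emits the work done per sequential
        # microblock: each thread accumulates the (r,s,t) derivatives of its
        # dof for INLINE_MB_COUNT microblocks, then converts them to x/y/z
        # derivatives via the per-element geometric factors.  The For loop
        # appended below runs that body once per seq_mb_number, which
        # GLOBAL_MB_NR depends on.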

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code()))
            ])

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("diff.cu", "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )

        if for_benchmark:
            rst_to_xyz = self.fake_localop_rst_to_xyz()
        else:
            rst_to_xyz = self.localop_rst_to_xyz(diff_op_cls, elgroup)

        rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex")
        rst_to_xyz.gpu_data.bind_to_texref_ext(rst_to_xyz_texref,
                allow_double_hack=True)

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(
                # all kernel arguments are pointers: the packed diff matrices
                # plus one output array per dimension (see apply_diff_mat)
                (1+discr.dimensions)*["P"],
                block=(self.plan.segment_size, par.parallel, 1),
                texrefs=[field_texref, rst_to_xyz_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return func, field_texref

    @memoize_method
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        given = self.plan.given

        columns = given.dofs_per_el()*discr.dimensions
        additional_columns = 0

        if columns % 2 == 0:
            columns += 1
            additional_columns += 1
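        # Rounding an even column count up to an odd one is a common trick to
        # avoid shared-memory bank conflicts when smem_diff_rst_mat is read
        # with a stride of DIFFMAT_COLUMNS; the extra column is filled with
        # zeros when the segments are packed below.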

        block_floats = given.devdata.align_dtype(
                columns*self.plan.segment_size, given.float_size())

        vstacked_matrices = [
                numpy.vstack(given.microblock.elements*(m,))
                for m in diff_op_cls.matrices(elgroup)
                ]
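        # The loop below packs, for each run of segment_size rows, the
        # corresponding rows of all the per-axis differentiation matrices
        # side by side (plus the zero padding column, if any); pad_and_join
        # then pads every packed segment to block_floats floats, so that
        # get_kernel's MB_SEGMENT*DIFFMAT_SEGMENT_BYTES offset lands exactly
        # on a segment boundary.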

        segments = []

        from pytools import single_valued
        for segment_start in range(
                0, given.microblock.elements*given.dofs_per_el(),
                self.plan.segment_size):
            matrices = [
                m[segment_start:segment_start+self.plan.segment_size]
                for m in vstacked_matrices]

            matrices.append(
                numpy.zeros((single_valued(m.shape[0] for m in matrices),
                    additional_columns))
                )

            diffmats = numpy.asarray(
                    numpy.hstack(matrices),
                    dtype=given.float_type,
                    order="C")
            segments.append(buffer(diffmats))

        from hedge.backends.cuda.tools import pad_and_join

        from pytools import Record
        class GPUDifferentiationMatrices(Record):
            pass

        return GPUDifferentiationMatrices(
                device_memory=cuda.to_device(
                    pad_and_join(segments, block_floats*given.float_size())),
                block_floats=block_floats,
                matrix_columns=columns)