1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 from pytools import memoize_method
26 import pycuda.driver as cuda
27 import pycuda.gpuarray as gpuarray
28 from pycuda.compiler import SourceModule
29 from hedge.backends.cuda.tools import FakeGPUArray
30 import hedge.backends.cuda.plan
31
32
33
34
35
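# An ExecutionPlan records one tuning configuration for the segmented
# element-local matrix kernel: the parallel/inline/sequential microblock
# split, the segment size, the unroll limit, and whether the generated code
# uses the prefetch branch that stages field data through shared memory.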
class ExecutionPlan(hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan):
    def __init__(self, given, parallelism, segment_size, max_unroll,
            use_prefetch_branch, debug_name,
            aligned_preimage_dofs_per_microblock, preimage_dofs_per_el):
        hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan.__init__(
                self, given, parallelism, segment_size, max_unroll)

        self.use_prefetch_branch = use_prefetch_branch

        self.debug_name = debug_name
        self.aligned_preimage_dofs_per_microblock = \
                aligned_preimage_dofs_per_microblock
        self.preimage_dofs_per_el = preimage_dofs_per_el

    def columns(self):
        return self.preimage_dofs_per_el

    def gpu_matrix_columns(self):
        columns = self.preimage_dofs_per_el

        # avoid smem fetch bank conflicts by ensuring odd col count
        if columns % 2 == 0:
            columns += 1

        return columns

    @memoize_method
    def gpu_matrix_block_floats(self):
        return self.given.devdata.align_dtype(
                self.gpu_matrix_columns()*self.segment_size,
                self.given.float_size())

    def registers(self):
        return 12 + self.parallelism.inline

    def fetch_buffer_segments(self):
        return 0

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_matrix",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                self.segment_size,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr, with_index_check):
        return Kernel(discr, self, with_index_check)

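# A Kernel instance ties one ExecutionPlan to a discretization: it can
# benchmark the configuration, apply the element-local matrix to a GPU field
# vector via __call__, and generate and compile the CUDA source in get_kernel.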
class Kernel:
    def __init__(self, discr, plan, with_index_check):
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        self.grid = (plan.segments_per_microblock(),
                int_ceiling(
                    self.plan.given.total_dofs()
                    / plan.dofs_per_macroblock())
                )

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        try:
            kernel, in_vector_texref, scaling_texref = \
                    self.get_kernel(with_scaling=True, for_benchmark=True)
        except cuda.CompileError:
            return None

        def vol_empty():
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        out_vector = vol_empty()
        in_vector = gpuarray.empty(
                given.matmul_preimage_shape(self.plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)

        fake_matrix = self.prepare_matrix(
                numpy.ones(
                    (given.dofs_per_el(), self.plan.preimage_dofs_per_el),
                    dtype=given.float_type))

        from hedge.backends.cuda.kernelbase import fake_elwise_scaling
        fake_scaling = fake_elwise_scaling(self.plan.given)
        fake_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20
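
        # Time `count` launches between two CUDA events; a launch failure
        # (e.g. a configuration that asks for too many resources) disqualifies
        # this plan by returning None.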
        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                estimated_mb_count = given.block_count*given.microblocks_per_block
                kernel.prepared_call(
                        self.grid,
                        out_vector.gpudata,
                        fake_matrix,
                        0,
                        estimated_mb_count)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return (1e-3/count * stop.time_since(start),
                kernel.local_size_bytes, kernel.shared_size_bytes, kernel.num_regs)

    def __call__(self, in_vector, prepped_mat, prepped_scaling):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.plan.given

        kernel, in_vector_texref, scaling_texref = \
                self.get_kernel(prepped_scaling is not None)

        out_vector = discr.volume_empty()
        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                    allow_double_hack=True)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()
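
        # With instrumentation enabled, use a timed launch and record an
        # estimate of the resulting global-memory traffic; otherwise launch
        # the prepared kernel directly.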
        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(
                        self.grid,
                        out_vector.gpudata,
                        prepped_mat,
                        debugbuf.gpudata,
                        len(discr.blocks)*given.microblocks_per_block,
                        ))

            from pytools import product
            discr.gmem_bytes_el_local.add(
                    given.float_size()
                    * (
                        # matrix fetch
                        self.plan.gpu_matrix_block_floats() * product(self.grid)
                        # field fetch
                        + self.plan.preimage_dofs_per_el
                        * given.dofs_per_el() * given.microblock.elements
                        * self.grid[1] * self.plan.parallelism.total()
                        # field store
                        + len(discr.nodes)
                        ))
        else:
            kernel.prepared_call(
                    self.grid,
                    out_vector.gpudata,
                    prepped_mat,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            copied_debugbuf.shape = (144,7)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector

    @memoize_method
    def get_kernel(self, with_scaling, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type
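
        # Signature of the generated kernel; the argument order must match
        # the "PPPI" format string passed to func.prepare() further down.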
        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism
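
        # Preprocessor defines tie the launch geometry to the plan:
        # threadIdx.x walks the DOFs of one segment, threadIdx.y the parallel
        # microblocks, blockIdx.x selects the segment and blockIdx.y the
        # macroblock.  The constant lookup tables at the end give the range
        # of elements each segment overlaps.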
        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "dof_buffer"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "DOFS_PER_SEGMENT"),
                ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [(chk*self.plan.segment_size)//given.dofs_per_el()
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [min(given.microblock.elements,
                        (chk*self.plan.segment_size+self.plan.segment_size-1)
                        //given.dofs_per_el()+1)
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

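        # Matrix-vector product, variant 1: used when a segment touches only
        # one element, so the needed field DOFs can be staged through the
        # shared dof_buffer one segment at a time before being multiplied
        # against the shared-memory matrix block.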
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                            for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                            Line(),
                            ])

                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
            result.append(Line())
            return result

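        # Matrix-vector product, variant 2: segments that straddle several
        # elements fetch the field directly from the texture, with the loop
        # over preimage DOFs unrolled up to the plan's max_unroll.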
        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                            Assign("fof%d" % inl,
                                "fp_tex1Dfetch(in_vector_tex, "
                                "GLOBAL_MB_PREIMG_DOF_BASE"
                                " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                                " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                                % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                            for inl in range(par.inline)
                            ]+[
                            Assign("lm",
                                "smem_matrix["
                                "%(row)s*MATRIX_COLUMNS + %(j)s]"
                                % {"j":j, "row": "SEGMENT_DOF"},
                                )
                            ]+[
                            S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                            for inl in range(par.inline)
                            ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [ Line(), ])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

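        # Sequential loop over the microblocks handled by one thread: each
        # iteration accumulates one result per inlined microblock and writes
        # it back, skipping padding DOFs and (if index checking is on)
        # microblocks beyond microblock_count.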
        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[ Line() ]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

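        # With the prefetch branch, emit one specialization of the loop per
        # possible number of elements touched by a segment and select it at
        # run time from segment_el_count; otherwise only the direct
        # texture-fetch variant is generated.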
        if self.plan.use_prefetch_branch:
            from codepy.cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("%s.cu" % self.plan.debug_name, "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

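        # "PPPI" mirrors the kernel signature: three pointers (out_vector,
        # gmem_matrix, debugbuf) followed by one unsigned int
        # (microblock_count).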
        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref