1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
import numpy
from pytools import memoize_method
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from hedge.backends.cuda.tools import FakeGPUArray
import hedge.backends.cuda.plan


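# Plan for applying a per-element local matrix with the input field staged in
# shared memory and the matrix fetched through a texture. The parallelism
# descriptor counts microblocks: "parallel" ones are handled by concurrent
# thread groups, "inline" ones by the same thread at once (in registers), and
# "serial" ones in an outer sequential loop.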
class ExecutionPlan(hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan):
    def __init__(self, given, parallelism, max_unroll, debug_name,
            aligned_preimage_dofs_per_microblock, preimage_dofs_per_el):
        hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan.__init__(
                self, given, parallelism, max_unroll)

        self.debug_name = debug_name
        self.aligned_preimage_dofs_per_microblock = \
                aligned_preimage_dofs_per_microblock
        self.preimage_dofs_per_el = preimage_dofs_per_el

    @memoize_method
    def shared_mem_use(self):
        given = self.given

        return (64  # fixed overhead estimate
                + given.float_size() * (
                    self.parallelism.parallel
                    * self.parallelism.inline
                    * self.aligned_preimage_dofs_per_microblock))

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_field",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                None,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr, with_index_check):
        return Kernel(discr, self, with_index_check)


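# The Kernel class below compiles and launches the CUDA code generated by
# get_kernel(). One CUDA block handles one "macroblock", i.e.
# parallel*inline*serial microblocks; the local matrix is read through a 2D
# texture and the input field is staged in shared memory.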
class Kernel(object):
    def __init__(self, discr, plan, with_index_check):
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        # one block per macroblock
        self.grid = (int_ceiling(
            self.plan.given.total_dofs()
            / self.plan.dofs_per_macroblock()),
            1)

    def benchmark(self):
        discr = self.discr
        given = self.plan.given
        elgroup, = discr.element_groups

        try:
            kernel, mat_texref, scaling_texref = \
                    self.get_kernel(with_scaling=True, for_benchmark=True)
        except cuda.CompileError:
            return None

        fake_matrix = self.prepare_matrix(
                numpy.ones(
                    (given.dofs_per_el(), self.plan.preimage_dofs_per_el),
                    dtype=given.float_type))
        mat_texref.set_array(fake_matrix)

        from hedge.backends.cuda.kernelbase import fake_elwise_scaling
        fake_scaling = fake_elwise_scaling(self.plan.given)
        fake_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True)

        def vol_empty():
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        out_vector = vol_empty()
        in_vector = gpuarray.empty(
                given.matmul_preimage_shape(self.plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

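        # time `count` launches between two CUDA events; the value returned
        # below is seconds per launch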
        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                estimated_mb_count = given.block_count*given.microblocks_per_block
                kernel.prepared_call(self.grid,
                        out_vector.gpudata,
                        in_vector.gpudata,
                        0,
                        estimated_mb_count,
                        )
            except cuda.LaunchError:
                return None
        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return (1e-3/count * stop.time_since(start),
                kernel.local_size_bytes, kernel.shared_size_bytes, kernel.num_regs)

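    # applies the prepared matrix (and optional per-element scaling) to
    # in_vector and returns the resulting volume vector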
    def __call__(self, in_vector, prepped_mat, prepped_scaling):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.discr.given

        kernel, mat_texref, scaling_texref = \
                self.get_kernel(with_scaling=prepped_scaling is not None)

        mat_texref.set_array(prepped_mat)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                    allow_double_hack=True)

        out_vector = discr.volume_empty()

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=self.plan.given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(self.grid,
                        out_vector.gpudata,
                        in_vector.gpudata,
                        debugbuf.gpudata,
                        len(discr.blocks)*given.microblocks_per_block,
                        ))

            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * self.plan.preimage_dofs_per_el
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel

                    # field fetch
                    + self.plan.preimage_dofs_per_el
                    * given.microblock.elements
                    * self.plan.parallelism.total()
                    )
            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats

                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_el_local.add(gmem_bytes)
        else:
            kernel.prepared_call(self.grid,
                    out_vector.gpudata,
                    in_vector.gpudata,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144, 7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector

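    # get_kernel() assembles the CUDA C source with codepy/cgen, compiles it
    # through pycuda's SourceModule and returns the prepared function together
    # with the texture references for the matrix and the optional scaling.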
    @memoize_method
    def get_kernel(self, with_scaling, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Include, \
                Define, \
                Initializer, If, For, Statement, Assign

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(
                Value("void", "apply_el_local_mat_smem_field"),
                [
                    Pointer(POD(float_type, "out_vector")),
                    Pointer(POD(float_type, "in_vector")),
                    Pointer(POD(float_type, "debugbuf")),
                    POD(numpy.uint32, "microblock_count"),
                    ]
                ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "mat_tex"),
                ])

        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("MB_EL_COUNT", given.microblock.elements),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("DOFS_PER_MB", "(DOFS_PER_EL*MB_EL_COUNT)"),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Line(),
                # thread axes: x indexes DOFs within a chunk, y the parallel
                # microblock slot, z the chunk within the microblock
                Define("CHUNK_SIZE", given.devdata.smem_granularity),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("EL_DOF", "(MB_DOF - mb_el*DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_BASE",
                    "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                # shared staging area for the (parallel x inline) microblocks'
                # input fields
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_in_vector"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "MB_DOF / DOFS_PER_EL"),
            Line(),
            ])

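        # get_load_code() emits code that cooperatively copies a microblock's
        # preimage DOFs from global into shared memory; only the last of the
        # ceil(preimage_dofs/dofs) passes needs a bounds check.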
        def get_load_code():
            mb_dofs = given.microblock.aligned_floats
            mb_preimg_dofs = self.plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_dofs-1) // mb_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # read into a register first, write to shared memory in a
                    # separate second pass
                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_DOFS_PER_MB + MB_DOF" % load_block
                    load_instr = Assign(var,
                            "in_vector[GLOBAL_MB_PREIMAGE_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_dofs >= mb_preimg_dofs:
                        # final pass may overshoot the preimage DOF count,
                        # so guard both the load and the store
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

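        # get_matmul_code() emits the body of one sequential pass: sync, stage
        # the field into shared memory, then let each thread (one output DOF)
        # accumulate the matrix-vector product for its inline microblocks and
        # apply the per-element scaling (inv_jac_multiplier) if requested.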
        def get_matmul_code():
            from hedge.backends.cuda.tools import unroll

            if self.with_index_check:
                index_check_condition = "GLOBAL_MB_NR < microblock_count"
            else:
                index_check_condition = ""

            def if_(conditions, then):
                final_cond = " && ".join(cond for cond in conditions if cond)
                if final_cond:
                    return If(final_cond, then)
                else:
                    return then

            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            result = Block([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"), Line(),
                ]+[if_([index_check_condition], get_load_code())]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
                ]+[
                Line(),
                POD(float_type, "mat_entry"),
                Line(),
                ])

            result.append(if_(["MB_DOF < DOFS_PER_MB", index_check_condition],
                Block(unroll(lambda j:
                    [Assign("mat_entry", "fp_tex2D(mat_tex, EL_DOF, %s)" % j)]
                    +[
                    S("result%d += mat_entry "
                        "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                        % (inl, inl, j))
                    for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll)
                    +[Line()]
                    +[Assign(
                        "out_vector[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % inl,
                        "result%d*%s" % (inl, (inv_jac_multiplier % {"inl": inl})))
                    for inl in range(par.inline)]
                    )))

            return result

        f_body.append(For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number", get_matmul_code()))

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("%s.cu" % self.plan.debug_name, "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )

        func = mod.get_function("apply_el_local_mat_smem_field")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        mat_texref = mod.get_texref("mat_tex")
        texrefs = [mat_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

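        # thread block layout matches the CHUNK_DOF/PAR_MB_NR/CHUNK_NR defines:
        # (smem granularity, parallel microblocks, chunks per microblock)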
        func.prepare(
                "PPPI",
                block=(
                    given.devdata.smem_granularity,
                    self.plan.parallelism.parallel,
                    given.microblock.aligned_floats//given.devdata.smem_granularity),
                texrefs=texrefs)

        return func, mat_texref, scaling_texref
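    # prepare_matrix() copies the local matrix into a CUDA array in Fortran
    # order, matching the fp_tex2D(mat_tex, EL_DOF, j) access in the kernel.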
    def prepare_matrix(self, matrix):
        given = self.plan.given

        assert matrix.shape == (given.dofs_per_el(), self.plan.preimage_dofs_per_el)

        return cuda.matrix_to_array(matrix.astype(given.float_type), "F",
                allow_double_hack=True)