1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
import numpy

from pytools import memoize_method

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

import hedge.backends.cuda.plan
from hedge.backends.cuda.kernelbase import DiffKernelBase


class ExecutionPlan(hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan):
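    # Shared memory footprint: one resident copy of the field for each of
    # the parallel*inline in-flight microblocks, plus a small fixed
    # overhead.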
    @memoize_method
    def shared_mem_use(self):
        given = self.given

        return (64  # parameters, block header, small extra stuff
                + given.float_size() * (
                    self.parallelism.parallel
                    * self.parallelism.inline
                    * self.given.microblock.aligned_floats))

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_field",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                None,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr):
        return Kernel(discr, self)


class Kernel(DiffKernelBase):
    def __init__(self, discr, plan):
        self.discr = discr
        self.plan = plan

        # One grid block per macroblock's worth of DOFs.
        from hedge.backends.cuda.tools import int_ceiling
        self.grid = (int_ceiling(
            self.plan.given.total_dofs()
            / self.plan.dofs_per_macroblock()), 1)

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        # Any first-derivative operator serves for timing; get_kernel
        # re-derives elgroup from discr, so None is acceptable here.
        from hedge.optemplate import DifferentiationOperator as op_class
        try:
            func = self.get_kernel(op_class, elgroup=None,
                    for_benchmark=True)
        except cuda.CompileError:
            return None

        def vol_empty():
            # A volume-sized vector, padded to whole macroblocks.
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        field = vol_empty()
        field.fill(0)

        xyz_diff = [vol_empty() for axis in range(discr.dimensions)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                func.prepared_call(self.grid,
                        0,
                        field.gpudata,
                        *xyz_diff_gpudata)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        # time_since() reports milliseconds; convert to seconds per launch.
        return (1e-3/count * stop.time_since(start),
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        func = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
        if use_debugbuf:
            debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        xyz_diff = [discr.volume_empty() for axis in range(d)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(
                    func.prepared_timed_call(self.grid,
                        debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata))
            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * discr.dimensions
                    * given.dofs_per_el()
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel

                    # field fetch
                    + given.microblock.aligned_floats
                    * self.plan.parallelism.total()
                    )

            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats

                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_diff.add(gmem_bytes)
        else:
            func.prepared_call(self.grid,
                    debugbuf.gpudata, field.gpudata, *xyz_diff_gpudata)

        if use_debugbuf:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print field.shape
            print copied_debugbuf
            raw_input()

        return xyz_diff

    @memoize_method
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(
            Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(float_type, "field"))]
            + [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims]))

        par = self.plan.parallelism

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value("texture<%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                "rst_to_xyz_tex"),
            ])
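        # Textures cannot hold doubles natively; the fp_tex_* types from
        # pycuda-helpers.hpp store each double as an int2 and reassemble
        # it on fetch (the "double hack" used throughout below).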
        if float_type == numpy.float64:
            cmod.append(Value(
                "texture<fp_tex_double, 1, cudaReadModeElementType>",
                "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value(
                "texture<float%d, 1, cudaReadModeElementType>"
                % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

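        # Thread layout: threadIdx.x indexes DOFs within a chunk of
        # smem-granularity size, threadIdx.y selects the parallel
        # microblock, and threadIdx.z the chunk within a microblock.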
        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ELS_PER_MB", given.microblock.elements),
            Define("DOFS_PER_MB", "(DOFS_PER_EL*ELS_PER_MB)"),
            Line(),
            Define("CHUNK_SIZE", given.devdata.smem_granularity),
            Define("CHUNK_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Define("CHUNK_NR", "threadIdx.z"),
            Define("MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
            Define("EL_DOF", "(MB_DOF - mb_el*DOFS_PER_EL)"),
            Line(),
            Define("MACROBLOCK_NR", "blockIdx.x"),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Line(),
            # One copy of the field per parallel*inline microblock.
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_field"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "ALIGNED_DOFS_PER_MB")),
            Line(),
            ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "MB_DOF / DOFS_PER_EL"),
            Line(),
            ])

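        # Generate the unrolled local matrix-vector product: each thread
        # accumulates reference-space derivatives d{inl}rst{axis} for its
        # DOF, then store_code applies the rst->xyz chain rule.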
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

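            # The store step: combine reference-space derivatives with the
            # per-element rst->xyz Jacobian factors fetched from texture,
            # one store per inline microblock and global axis.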
            store_code = Block()
            for inl in range(par.inline):
                for glob_axis in dims:
                    store_code.append(Assign(
                        "dxyz%d[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                        % (glob_axis, inl),
                        " + ".join(
                            "fp_tex1Dfetch(rst_to_xyz_tex, %(loc_axis)d + "
                            "DIMENSIONS*(%(glob_axis)d + DIMENSIONS*("
                            "(GLOBAL_MB_NR+%(inl)d)*ELS_PER_MB + mb_el)))"
                            "* d%(inl)drst%(loc_axis)d" % {
                                "loc_axis": loc_axis,
                                "glob_axis": glob_axis,
                                "inl": inl
                                }
                            for loc_axis in dims
                            )
                        ))

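            # Double-buffer discipline: one barrier so nobody overwrites
            # smem_field while it is still being read, a second so all
            # loads have arrived before anyone consumes them.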
            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Assign("smem_field[PAR_MB_NR][%d][MB_DOF]" % inl,
                    "field[(GLOBAL_MB_NR+%d)*ALIGNED_DOFS_PER_MB + MB_DOF]" % inl)
                for inl in range(par.inline)
                ]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

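            # j indexes the column of the differentiation matrix (the
            # source DOF within the element). For float32, one float4
            # fetch yields the matrix entries for all local axes at once;
            # for float64, each axis needs its own (hacked) double fetch.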
            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, EL_DOF + %s*DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, "
                            "%(axis)d + DIMENSIONS*(EL_DOF + %(j)s*DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                                "inl": inl,
                                "axis": axis,
                                "j": j
                                })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("MB_DOF < DOFS_PER_MB", Block(unroll(unroll_body,
                total_number=given.dofs_per_el(),
                max_unroll=self.plan.max_unroll)
                +[store_code])))

            return code

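        # Outer loop: each thread block works through SEQ_MB_COUNT
        # microblocks sequentially.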
        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("diff.cu", "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug)

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        if for_benchmark:
            rst_to_xyz = self.fake_localop_rst_to_xyz()
        else:
            rst_to_xyz = self.localop_rst_to_xyz(diff_op_cls, elgroup)

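        # Bind the geometry factors and the differentiation matrices to
        # their textures; allow_double_hack stores doubles as int2 behind
        # the scenes.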
        rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex")
        rst_to_xyz.gpu_data.bind_to_texref_ext(rst_to_xyz_texref,
                allow_double_hack=True)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op_cls, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

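        # Every kernel argument is a pointer: debugbuf, field, and one
        # output array per dimension.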
        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                block=(
                    given.devdata.smem_granularity,
                    self.plan.parallelism.parallel,
                    given.microblock.aligned_floats//given.devdata.smem_granularity),
                texrefs=[rst_to_xyz_texref, diff_rst_mat_texref])
        return func

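    # The differentiation matrices, stacked along the first axis (one slab
    # per local axis, padded to a valid texture channel count for float32)
    # in Fortran order, so that a single float4 fetch at (EL_DOF, j)
    # delivers the matrix entries for all axes at once.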
    @memoize_method
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        given = self.plan.given
        d = discr.dimensions

        if given.float_type == numpy.float32:
            first_dim = given.devdata.make_valid_tex_channel_count(d)
        elif given.float_type == numpy.float64:
            first_dim = d
        else:
            assert False

        result = numpy.zeros((first_dim, given.dofs_per_el(), given.dofs_per_el()),
                dtype=given.float_type, order="F")
        for i, dm in enumerate(diff_op_cls.matrices(elgroup)):
            result[i] = dm

        return gpuarray.to_gpu(result)