1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
import numpy
from pytools import memoize_method
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from hedge.backends.cuda.tools import FakeGPUArray
import hedge.backends.cuda.plan


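# Plan for applying a per-element local matrix with the input field staged in
# shared memory and the matrix fetched through a texture. The parallelism
# descriptor counts microblocks: "parallel" ones are handled by concurrent
# thread groups, "inline" ones by the same thread at once (in registers), and
# "serial" ones in an outer sequential loop.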
class ExecutionPlan(hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan):
    def __init__(self, given, parallelism, max_unroll, debug_name,
            aligned_preimage_dofs_per_microblock, preimage_dofs_per_el):
        hedge.backends.cuda.plan.SMemFieldLocalOpExecutionPlan.__init__(
                self, given, parallelism, max_unroll)

        self.debug_name = debug_name
        self.aligned_preimage_dofs_per_microblock = \
                aligned_preimage_dofs_per_microblock
        self.preimage_dofs_per_el = preimage_dofs_per_el

    @memoize_method
    def shared_mem_use(self):
        given = self.given

        return (64  # fixed overhead estimate
                + given.float_size() * (
                    self.parallelism.parallel
                    * self.parallelism.inline
                    * self.aligned_preimage_dofs_per_microblock))

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_field",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                None,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr, with_index_check):
        return Kernel(discr, self, with_index_check)


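# The Kernel class below compiles and launches the CUDA code generated by
# get_kernel(). One CUDA block handles one "macroblock", i.e.
# parallel*inline*serial microblocks; the local matrix is read through a 2D
# texture and the input field is staged in shared memory.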
class Kernel(object):
    def __init__(self, discr, plan, with_index_check):
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        # one block per macroblock
        self.grid = (int_ceiling(
            self.plan.given.total_dofs()
            / self.plan.dofs_per_macroblock()),
            1)

    def benchmark(self):
        discr = self.discr
        given = self.plan.given
        elgroup, = discr.element_groups

        try:
            kernel, mat_texref, scaling_texref = \
                    self.get_kernel(with_scaling=True, for_benchmark=True)
        except cuda.CompileError:
            return None

        fake_matrix = self.prepare_matrix(
                numpy.ones(
                    (given.dofs_per_el(), self.plan.preimage_dofs_per_el),
                    dtype=given.float_type))
        mat_texref.set_array(fake_matrix)

        from hedge.backends.cuda.kernelbase import fake_elwise_scaling
        fake_scaling = fake_elwise_scaling(self.plan.given)
        fake_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True)

        def vol_empty():
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        out_vector = vol_empty()
        in_vector = gpuarray.empty(
                given.matmul_preimage_shape(self.plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

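        # time `count` launches between two CUDA events; the value returned
        # below is seconds per launch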
        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                estimated_mb_count = given.block_count*given.microblocks_per_block
                kernel.prepared_call(self.grid,
                        out_vector.gpudata,
                        in_vector.gpudata,
                        0,
                        estimated_mb_count,
                        )
            except cuda.LaunchError:
                return None
        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return (1e-3/count * stop.time_since(start),
                kernel.local_size_bytes, kernel.shared_size_bytes, kernel.num_regs)

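    # applies the prepared matrix (and optional per-element scaling) to
    # in_vector and returns the resulting volume vector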
    def __call__(self, in_vector, prepped_mat, prepped_scaling):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.discr.given

        kernel, mat_texref, scaling_texref = \
                self.get_kernel(with_scaling=prepped_scaling is not None)

        mat_texref.set_array(prepped_mat)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                    allow_double_hack=True)

        out_vector = discr.volume_empty()

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=self.plan.given.float_type)
        else:
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(self.grid,
                        out_vector.gpudata,
                        in_vector.gpudata,
                        debugbuf.gpudata,
                        len(discr.blocks)*given.microblocks_per_block,
                        ))

            block_gmem_floats = (
                    # matrix fetch
                    given.microblock.aligned_floats
                    * self.plan.preimage_dofs_per_el
                    * self.plan.parallelism.serial
                    * self.plan.parallelism.parallel

                    # field fetch
                    + self.plan.preimage_dofs_per_el
                    * given.microblock.elements
                    * self.plan.parallelism.total()
                    )
            gmem_bytes = given.float_size() * (
                    self.grid[0] * block_gmem_floats

                    # field store
                    + len(discr.nodes))

            discr.gmem_bytes_el_local.add(gmem_bytes)
        else:
            kernel.prepared_call(self.grid,
                    out_vector.gpudata,
                    in_vector.gpudata,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144, 7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector

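    # get_kernel() assembles the CUDA C source with codepy/cgen, compiles it
    # through pycuda's SourceModule and returns the prepared function together
    # with the texture references for the matrix and the optional scaling.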
    @memoize_method
    def get_kernel(self, with_scaling, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Include, \
                Define, \
                Initializer, If, For, Statement, Assign

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(
                Value("void", "apply_el_local_mat_smem_field"),
                [
                    Pointer(POD(float_type, "out_vector")),
                    Pointer(POD(float_type, "in_vector")),
                    Pointer(POD(float_type, "debugbuf")),
                    POD(numpy.uint32, "microblock_count"),
                    ]
                ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "mat_tex"),
                ])

        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("MB_EL_COUNT", given.microblock.elements),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("DOFS_PER_MB", "(DOFS_PER_EL*MB_EL_COUNT)"),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Line(),
                # thread axes: x indexes DOFs within a chunk, y the parallel
                # microblock slot, z the chunk within the microblock
                Define("CHUNK_SIZE", given.devdata.smem_granularity),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("EL_DOF", "(MB_DOF - mb_el*DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_BASE",
                    "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                # shared staging area for the (parallel x inline) microblocks'
                # input fields
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_in_vector"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "MB_DOF / DOFS_PER_EL"),
            Line(),
            ])

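        # get_load_code() emits code that cooperatively copies a microblock's
        # preimage DOFs from global into shared memory; only the last of the
        # ceil(preimage_dofs/dofs) passes needs a bounds check.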
        def get_load_code():
            mb_dofs = given.microblock.aligned_floats
            mb_preimg_dofs = self.plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_dofs-1) // mb_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # read into a register first, write to shared memory in a
                    # separate second pass
                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_DOFS_PER_MB + MB_DOF" % load_block
                    load_instr = Assign(var,
                            "in_vector[GLOBAL_MB_PREIMAGE_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_dofs >= mb_preimg_dofs:
                        # final pass may overshoot the preimage DOF count,
                        # so guard both the load and the store
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

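        # get_matmul_code() emits the body of one sequential pass: sync, stage
        # the field into shared memory, then let each thread (one output DOF)
        # accumulate the matrix-vector product for its inline microblocks and
        # apply the per-element scaling (inv_jac_multiplier) if requested.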
        def get_matmul_code():
            from hedge.backends.cuda.tools import unroll

            if self.with_index_check:
                index_check_condition = "GLOBAL_MB_NR < microblock_count"
            else:
                index_check_condition = ""

            def if_(conditions, then):
                final_cond = " && ".join(cond for cond in conditions if cond)
                if final_cond:
                    return If(final_cond, then)
                else:
                    return then

            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            result = Block([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"), Line(),
                ]+[if_([index_check_condition], get_load_code())]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
                ]+[
                Line(),
                POD(float_type, "mat_entry"),
                Line(),
                ])

            result.append(if_(["MB_DOF < DOFS_PER_MB", index_check_condition],
                Block(unroll(lambda j:
                    [Assign("mat_entry", "fp_tex2D(mat_tex, EL_DOF, %s)" % j)]
                    +[
                    S("result%d += mat_entry "
                        "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                        % (inl, inl, j))
                    for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll)
                    +[Line()]
                    +[Assign(
                        "out_vector[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % inl,
                        "result%d*%s" % (inl, (inv_jac_multiplier % {"inl": inl})))
                    for inl in range(par.inline)]
                    )))

            return result

        f_body.append(For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number", get_matmul_code()))

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("%s.cu" % self.plan.debug_name, "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )

        func = mod.get_function("apply_el_local_mat_smem_field")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        mat_texref = mod.get_texref("mat_tex")
        texrefs = [mat_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

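        # thread block layout matches the CHUNK_DOF/PAR_MB_NR/CHUNK_NR defines:
        # (smem granularity, parallel microblocks, chunks per microblock)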
        func.prepare(
                "PPPI",
                block=(
                    given.devdata.smem_granularity,
                    self.plan.parallelism.parallel,
                    given.microblock.aligned_floats//given.devdata.smem_granularity),
                texrefs=texrefs)

        return func, mat_texref, scaling_texref
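    # prepare_matrix() copies the local matrix into a CUDA array in Fortran
    # order, matching the fp_tex2D(mat_tex, EL_DOF, j) access in the kernel.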
    def prepare_matrix(self, matrix):
        given = self.plan.given

        assert matrix.shape == (given.dofs_per_el(), self.plan.preimage_dofs_per_el)

        return cuda.matrix_to_array(matrix.astype(given.float_type), "F",
                allow_double_hack=True)