1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 from pytools import memoize_method
26 import pycuda.driver as cuda
27 import pycuda.gpuarray as gpuarray
28 from pycuda.compiler import SourceModule
29 from hedge.backends.cuda.tools import FakeGPUArray
30 import hedge.backends.cuda.plan
31
32
33
34
35
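# An ExecutionPlan records one tuning configuration for the segmented
# element-local matrix kernel: the parallel/inline/sequential microblock
# split, the segment size, the unroll limit, and whether the generated code
# uses the prefetch branch that stages field data through shared memory.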
class ExecutionPlan(hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan):
    def __init__(self, given, parallelism, segment_size, max_unroll,
            use_prefetch_branch, debug_name,
            aligned_preimage_dofs_per_microblock, preimage_dofs_per_el):
        hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan.__init__(
                self, given, parallelism, segment_size, max_unroll)

        self.use_prefetch_branch = use_prefetch_branch

        self.debug_name = debug_name
        self.aligned_preimage_dofs_per_microblock = \
                aligned_preimage_dofs_per_microblock
        self.preimage_dofs_per_el = preimage_dofs_per_el

    def columns(self):
        return self.preimage_dofs_per_el

    def gpu_matrix_columns(self):
        columns = self.preimage_dofs_per_el

        # avoid smem fetch bank conflicts by ensuring odd col count
        if columns % 2 == 0:
            columns += 1

        return columns

    @memoize_method
    def gpu_matrix_block_floats(self):
        return self.given.devdata.align_dtype(
                self.gpu_matrix_columns()*self.segment_size,
                self.given.float_size())

    def registers(self):
        return 12 + self.parallelism.inline

    def fetch_buffer_segments(self):
        return 0

    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_matrix",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                self.segment_size,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr, with_index_check):
        return Kernel(discr, self, with_index_check)

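# A Kernel instance ties one ExecutionPlan to a discretization: it can
# benchmark the configuration, apply the element-local matrix to a GPU field
# vector via __call__, and generate and compile the CUDA source in get_kernel.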
class Kernel:
    def __init__(self, discr, plan, with_index_check):
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        self.grid = (plan.segments_per_microblock(),
                int_ceiling(
                    self.plan.given.total_dofs()
                    / plan.dofs_per_macroblock())
                )

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        try:
            kernel, in_vector_texref, scaling_texref = \
                    self.get_kernel(with_scaling=True, for_benchmark=True)
        except cuda.CompileError:
            return None

        def vol_empty():
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())

            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        out_vector = vol_empty()
        in_vector = gpuarray.empty(
                given.matmul_preimage_shape(self.plan),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)

        fake_matrix = self.prepare_matrix(
                numpy.ones(
                    (given.dofs_per_el(), self.plan.preimage_dofs_per_el),
                    dtype=given.float_type))

        from hedge.backends.cuda.kernelbase import fake_elwise_scaling
        fake_scaling = fake_elwise_scaling(self.plan.given)
        fake_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20
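
        # Time `count` launches between two CUDA events; a launch failure
        # (e.g. a configuration that asks for too many resources) disqualifies
        # this plan by returning None.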
        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                estimated_mb_count = given.block_count*given.microblocks_per_block
                kernel.prepared_call(
                        self.grid,
                        out_vector.gpudata,
                        fake_matrix,
                        0,
                        estimated_mb_count)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return (1e-3/count * stop.time_since(start),
                kernel.local_size_bytes, kernel.shared_size_bytes, kernel.num_regs)

    def __call__(self, in_vector, prepped_mat, prepped_scaling):
        discr = self.discr
        elgroup, = discr.element_groups
        given = self.plan.given

        kernel, in_vector_texref, scaling_texref = \
                self.get_kernel(prepped_scaling is not None)

        out_vector = discr.volume_empty()
        in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
        if prepped_scaling is not None:
            prepped_scaling.bind_to_texref_ext(scaling_texref,
                    allow_double_hack=True)

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
        else:
            debugbuf = FakeGPUArray()
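
        # With instrumentation enabled, use a timed launch and record an
        # estimate of the resulting global-memory traffic; otherwise launch
        # the prepared kernel directly.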
        if discr.instrumented:
            discr.el_local_timer.add_timer_callable(
                    kernel.prepared_timed_call(
                        self.grid,
                        out_vector.gpudata,
                        prepped_mat,
                        debugbuf.gpudata,
                        len(discr.blocks)*given.microblocks_per_block,
                        ))

            from pytools import product
            discr.gmem_bytes_el_local.add(
                    given.float_size()
                    * (
                        # matrix fetch
                        self.plan.gpu_matrix_block_floats() * product(self.grid)
                        # field fetch
                        + self.plan.preimage_dofs_per_el
                        * given.dofs_per_el() * given.microblock.elements
                        * self.grid[1] * self.plan.parallelism.total()
                        # field store
                        + len(discr.nodes)
                        ))
        else:
            kernel.prepared_call(
                    self.grid,
                    out_vector.gpudata,
                    prepped_mat,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    )

        if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug:
            copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7))
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            copied_debugbuf.shape = (144,7)
            numpy.set_printoptions(threshold=3000)

            print copied_debugbuf
            raw_input()

        return out_vector

    @memoize_method
    def get_kernel(self, with_scaling, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type
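
        # Signature of the generated kernel; the argument order must match
        # the "PPPI" format string passed to func.prepare() further down.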
        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism
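
        # Preprocessor defines tie the launch geometry to the plan:
        # threadIdx.x walks the DOFs of one segment, threadIdx.y the parallel
        # microblocks, blockIdx.x selects the segment and blockIdx.y the
        # macroblock.  The constant lookup tables at the end give the range
        # of elements each segment overlaps.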
        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "dof_buffer"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "DOFS_PER_SEGMENT"),
                ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [(chk*self.plan.segment_size)//given.dofs_per_el()
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [min(given.microblock.elements,
                        (chk*self.plan.segment_size+self.plan.segment_size-1)
                        //given.dofs_per_el()+1)
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

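        # Matrix-vector product, variant 1: used when a segment touches only
        # one element, so the needed field DOFs can be staged through the
        # shared dof_buffer one segment at a time before being multiplied
        # against the shared-memory matrix block.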
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                            for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                            Line(),
                            ])

                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
            result.append(Line())
            return result

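        # Matrix-vector product, variant 2: segments that straddle several
        # elements fetch the field directly from the texture, with the loop
        # over preimage DOFs unrolled up to the plan's max_unroll.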
        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                            Assign("fof%d" % inl,
                                "fp_tex1Dfetch(in_vector_tex, "
                                "GLOBAL_MB_PREIMG_DOF_BASE"
                                " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                                " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                                % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                            for inl in range(par.inline)
                            ]+[
                            Assign("lm",
                                "smem_matrix["
                                "%(row)s*MATRIX_COLUMNS + %(j)s]"
                                % {"j":j, "row": "SEGMENT_DOF"},
                                )
                            ]+[
                            S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                            for inl in range(par.inline)
                            ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [ Line(), ])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

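        # Sequential loop over the microblocks handled by one thread: each
        # iteration accumulates one result per inlined microblock and writes
        # it back, skipping padding DOFs and (if index checking is on)
        # microblocks beyond microblock_count.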
        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[ Line() ]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

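        # With the prefetch branch, emit one specialization of the loop per
        # possible number of elements touched by a segment and select it at
        # run time from segment_el_count; otherwise only the direct
        # texture-fetch variant is generated.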
        if self.plan.use_prefetch_branch:
            from codepy.cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("%s.cu" % self.plan.debug_name, "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

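        # "PPPI" mirrors the kernel signature: three pointers (out_vector,
        # gmem_matrix, debugbuf) followed by one unsigned int
        # (microblock_count).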
        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref