
Source Code for Module hedge.backends.cuda.diff_shared_segmat

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24   
 25  import numpy 
 26  from pytools import memoize_method 
 27  import pycuda.driver as cuda 
 28  from pycuda.compiler import SourceModule 
 29  import hedge.backends.cuda.plan 
 30  from hedge.backends.cuda.kernelbase import DiffKernelBase 
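
# This module builds the "shared-memory segmented matrix" variant of
# hedge's CUDA local differentiation kernel: each thread block stages
# one row segment of the horizontally stacked r/s/t differentiation
# matrices in shared memory, streams the field in through a texture,
# and writes one result vector per global (xyz) axis.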


# plan ------------------------------------------------------------------------
class ExecutionPlan(hedge.backends.cuda.plan.SegmentedMatrixLocalOpExecutionPlan):
    def columns(self):
        return self.given.dofs_per_el() * self.given.ldis.dimensions  # r,s,t
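
    # Rough per-thread register estimate for this plan; the count grows
    # with the number of inlined microblocks. (An estimate used when
    # comparing candidate plans, not an exact figure.)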
    def registers(self):
        return 16 + 4 * (self.parallelism.inline-1)

    def fetch_buffer_segments(self):
        return 0
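
    # feature_columns() and features() give a flat, tabular description
    # of this plan (the column strings read like SQL declarations),
    # suitable for recording benchmark results.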
    @staticmethod
    def feature_columns():
        return ("type text",
                "parallel integer",
                "inline integer",
                "serial integer",
                "segment_size integer",
                "max_unroll integer",
                "mb_elements integer",
                "lmem integer",
                "smem integer",
                "registers integer",
                "threads integer",
                )

    def features(self, lmem, smem, registers):
        return ("smem_matrix",
                self.parallelism.parallel,
                self.parallelism.inline,
                self.parallelism.serial,
                self.segment_size,
                self.max_unroll,
                self.given.microblock.elements,
                lmem,
                smem,
                registers,
                self.threads(),
                )

    def make_kernel(self, discr):
        return Kernel(discr, self)



# kernel ----------------------------------------------------------------------
class Kernel(DiffKernelBase):
    def __init__(self, discr, plan):
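        # Precompute the launch grid: blockIdx.x enumerates the
        # segments of one microblock, blockIdx.y the macroblocks needed
        # to cover all DOFs (each macroblock spans
        # parallel*inline*serial microblocks).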
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        given = self.plan.given
        self.grid = (plan.segments_per_microblock(),
                int_ceiling(given.total_dofs()/plan.dofs_per_macroblock()))
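
    # benchmark() times `count` launches of the generated kernel (used
    # when tuning execution plans); it returns seconds per launch plus
    # the kernel's lmem/smem/register usage, or None if this plan fails
    # to compile or launch.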
    def benchmark(self):
        discr = self.discr
        given = self.plan.given
        elgroup, = discr.element_groups

        from hedge.optemplate import DifferentiationOperator as op_class
        try:
            func, field_texref = self.get_kernel(op_class, elgroup, for_benchmark=True)
        except cuda.CompileError:
            return None

        def vol_empty():
            from hedge.backends.cuda.tools import int_ceiling
            dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())
            assert dofs == self.grid[1] * self.plan.dofs_per_macroblock()

            import pycuda.gpuarray as gpuarray
            return gpuarray.empty((dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)

        field = vol_empty()
        field.fill(0)

        field.bind_to_texref_ext(field_texref, allow_double_hack=True)
        xyz_diff = [vol_empty() for axis in range(discr.dimensions)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        if "cuda_fastbench" in self.discr.debug:
            count = 1
        else:
            count = 20

        gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

        start = cuda.Event()
        start.record()
        cuda.Context.synchronize()
        for i in range(count):
            try:
                func.prepared_call(self.grid, gpu_diffmats.device_memory,
                        *xyz_diff_gpudata)
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return (1e-3/count * stop.time_since(start),
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)
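
    # Apply the operator: differentiate `field` along the d local axes
    # and return one volume vector per global (xyz) axis.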
    def __call__(self, op_class, field):
        discr = self.discr
        given = self.plan.given

        d = discr.dimensions
        elgroup, = discr.element_groups

        func, field_texref = self.get_kernel(op_class, elgroup)

        assert field.dtype == given.float_type

        field.bind_to_texref_ext(field_texref, allow_double_hack=True)

        xyz_diff = [discr.volume_empty() for axis in range(d)]
        xyz_diff_gpudata = [subarray.gpudata for subarray in xyz_diff]

        gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

        if discr.instrumented:
            discr.diff_op_timer.add_timer_callable(func.prepared_timed_call(
                    self.grid, gpu_diffmats.device_memory,
                    *xyz_diff_gpudata))

            from pytools import product
            discr.gmem_bytes_diff.add(
                    given.float_size()
                    * (
                        # matrix fetch
                        gpu_diffmats.block_floats * product(self.grid)
                        # field fetch
                        + given.dofs_per_el()
                        * given.dofs_per_el() * given.microblock.elements
                        * self.grid[1] * self.plan.parallelism.total()
                        # field store
                        + len(discr.nodes)
                        ))
        else:
            func.prepared_call(self.grid, gpu_diffmats.device_memory,
                    *xyz_diff_gpudata)

        if False:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            #print numpy.reshape(copied_debugbuf, (len(copied_debugbuf)//16, 16))
            print copied_debugbuf[:100].reshape((10,10))
            raw_input()

        return xyz_diff
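
    # get_kernel: generate the CUDA source for this plan with codepy's
    # cgen tree, compile it with pycuda's SourceModule, and prepare the
    # launch; memoized per (operator class, element group).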
    @memoize_method
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from codepy.cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from codepy.cgen import dtype_to_ctype
        from codepy.cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                #Pointer(POD(float_type, "debugbuf")),
                ] + [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims]
            ))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "rst_to_xyz_tex"),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ELS_PER_MB", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Line(),
                Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)"
                    % given.float_size()),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment")
            +[S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row":row, "col":col, "axis":axis}
                        )
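
            # The staged segment holds the d per-axis differentiation
            # matrices side by side: entry (row, col) of axis a lives
            # at row*DIFFMAT_COLUMNS + a*DOFS_PER_EL + col, as encoded
            # by get_mat_entry() above.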

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend(
                    [POD(float_type, "field_value%d" % inl)
                        for inl in range(par.inline)]
                    +[Line()]
                    +unroll(lambda j: [
                        Assign("field_value%d" % inl,
                            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                            "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                            )
                        for inl in range(par.inline)]
                        +[Line()]
                        +[S("d%drst%d += %s * field_value%d"
                            % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                            for axis in dims
                            for inl in range(par.inline)]
                        +[Line()],
                        given.dofs_per_el(), self.plan.max_unroll)
                    )

            store_code = Block()
            for inl in range(par.inline):
                for glob_axis in dims:
                    store_code.append(Assign(
                        "dxyz%d[GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                        % (glob_axis, inl),
                        " + ".join(
                            "fp_tex1Dfetch(rst_to_xyz_tex, %(loc_axis)d + "
                            "DIMENSIONS*(%(glob_axis)d + DIMENSIONS*("
                            "(GLOBAL_MB_NR+%(inl)d)*ELS_PER_MB + mb_el)))"
                            "* d%(inl)drst%(loc_axis)d" % {
                                "loc_axis": loc_axis,
                                "glob_axis": glob_axis,
                                "inl": inl
                                }
                            for loc_axis in dims
                            )
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code()))
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            open("diff.cu", "w").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=10"]
                )

        if for_benchmark:
            rst_to_xyz = self.fake_localop_rst_to_xyz()
        else:
            rst_to_xyz = self.localop_rst_to_xyz(diff_op_cls, elgroup)

        rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex")
        rst_to_xyz.gpu_data.bind_to_texref_ext(rst_to_xyz_texref,
                allow_double_hack=True)

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(
                discr.dimensions*[float_type] + ["P"],
                block=(self.plan.segment_size, par.parallel, 1),
                texrefs=[field_texref, rst_to_xyz_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return func, field_texref

    # data blocks -------------------------------------------------------------
    @memoize_method
    def gpu_diffmats(self, diff_op_cls, elgroup):
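        # Build the device-side matrix blob: for every row segment,
        # hstack the d local differentiation matrices (plus a zero
        # padding column when needed to keep the column count odd) and
        # pad each segment to an aligned block size, so a thread block
        # can fetch its whole segment with one contiguous copy.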
        discr = self.discr
        given = self.plan.given

        columns = given.dofs_per_el()*discr.dimensions
        additional_columns = 0
        # avoid smem fetch bank conflicts by ensuring odd col count
        if columns % 2 == 0:
            columns += 1
            additional_columns += 1

        block_floats = given.devdata.align_dtype(
                columns*self.plan.segment_size, given.float_size())

        vstacked_matrices = [
                numpy.vstack(given.microblock.elements*(m,))
                for m in diff_op_cls.matrices(elgroup)
                ]

        segments = []

        from pytools import single_valued
        for segment_start in range(
                0, given.microblock.elements*given.dofs_per_el(),
                self.plan.segment_size):
            matrices = [
                m[segment_start:segment_start+self.plan.segment_size]
                for m in vstacked_matrices]

            matrices.append(
                numpy.zeros((single_valued(m.shape[0] for m in matrices),
                    additional_columns))
                )

            diffmats = numpy.asarray(
                    numpy.hstack(matrices),
                    dtype=given.float_type,
                    order="C")
            segments.append(buffer(diffmats))

        from hedge.backends.cuda.tools import pad_and_join

        from pytools import Record
        class GPUDifferentiationMatrices(Record): pass

        return GPUDifferentiationMatrices(
                device_memory=cuda.to_device(
                    pad_and_join(segments, block_floats*given.float_size())),
                block_floats=block_floats,
                matrix_columns=columns)
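

# Usage sketch (illustrative only; `discr`, `plan`, and `u` are
# hypothetical stand-ins for a CUDA discretization, a tuned
# ExecutionPlan, and a GPU field vector of dtype given.float_type):
#
#     from hedge.optemplate import DifferentiationOperator
#     kernel = plan.make_kernel(discr)
#     dxyz = kernel(DifferentiationOperator, u)  # one vector per axis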