
Source Code for Module hedge.backends.cuda.plan

"""Interface with Nvidia CUDA."""

from __future__ import division

__copyright__ = "Copyright (C) 2008 Andreas Kloeckner"

__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see U{http://www.gnu.org/licenses/}.
"""


import numpy
from pytools import memoize_method


class Parallelism:
    """Defines how much of a task is accomplished sequentially vs. in-line parallel
    vs. completely in parallel.

    To fix terminology:

    - "parallel" means "in separate threads".
    - "inline" means "in the same thread, but sharing some data."
    - "serial" means "in the same thread, but in separate, data-independent stages."
    """

    def __init__(self, parallel, inline, serial):
        self.parallel = parallel
        self.inline = inline
        self.serial = serial

    def total(self):
        return self.parallel*self.inline*self.serial

    def __str__(self):
        return "(%s)" % (" ".join("%s%d" % (cat, count) for cat, count in [
            ("p", self.parallel), ("i", self.inline), ("s", self.serial)]
            if count != 1))
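

# Illustrative (hypothetical) reading of a Parallelism: a granularity that
# processes 4 work units in separate threads, 2 inline within a thread, and
# 3 in sequence covers 4*2*3 = 24 units per macroblock.
#
#   par = Parallelism(parallel=4, inline=2, serial=3)
#   par.total()   # -> 24
#   str(par)      # -> "(p4 i2 s3)"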


def optimize_plan(opt_name, plan_generator, target_func, maximize,
        debug_flags=set(), occupancy_slack=0.5, log_filename=None):
    plans = [p for p in plan_generator() if p.invalid_reason() is None]

    debug = "cuda_%s_plan" % opt_name in debug_flags
    show_progress = ("cuda_plan_no_progress" not in debug_flags) and not debug

    if "cuda_plan_log" not in debug_flags:
        log_filename = None

    if not plans:
        raise RuntimeError("no valid CUDA execution plans found")

    if set(["cuda_no_plan", "cuda_no_plan_"+opt_name]) & debug_flags:
        from pytools import argmax2
        return argmax2((plan, plan.occupancy_record().occupancy)
                for plan in plans), 0

    max_occup = max(plan.occupancy_record().occupancy for plan in plans)
    desired_occup = occupancy_slack*max_occup

    if log_filename is not None:
        from pytools import single_valued
        feature_columns = single_valued(p.feature_columns() for p in plans)
        feature_names = [fc.split()[0] for fc in feature_columns]

        try:
            import sqlite3 as sqlite
        except ImportError:
            from pysqlite2 import dbapi2 as sqlite

        db_conn = sqlite.connect("plan-%s.dat" % log_filename)

        try:
            db_conn.execute("""
                create table data (
                  id integer primary key autoincrement,
                  %s,
                  value real)"""
                % ", ".join(feature_columns))
        except sqlite.OperationalError:
            pass

    if show_progress:
        from pytools import ProgressBar
        pbar = ProgressBar("plan "+opt_name, len(plans))
    try:
        plan_values = []
        for p in plans:
            if show_progress:
                pbar.progress()

            if p.occupancy_record().occupancy >= desired_occup - 1e-10:
                if debug:
                    print "<---- trying %s:" % p

                value = target_func(p)
                if isinstance(value, tuple):
                    extra_info = value[1:]
                    value = value[0]
                else:
                    extra_info = None

                if value is not None:
                    if debug:
                        print "----> yielded %g" % value
                    plan_values.append(((len(plan_values), p), value))

                    if log_filename is not None:
                        db_conn.execute(
                                "insert into data (%s,value) values (%s)"
                                % (", ".join(feature_names),
                                    ",".join(["?"]*(1+len(feature_names)))),
                                p.features(*extra_info)+(value,))
    finally:
        if show_progress:
            pbar.finished()

    if log_filename is not None:
        db_conn.commit()

    from pytools import argmax2, argmin2
    if maximize:
        num_plan, plan = argmax2(plan_values)
    else:
        num_plan, plan = argmin2(plan_values)

    plan_value = plan_values[num_plan][1]

    if debug:
        print "----------------------------------------------"
        print "chosen: %s" % plan
        print "value: %g" % plan_value
        print "----------------------------------------------"

    return plan, plan_value
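

# Illustrative (hypothetical) call sketch for optimize_plan: plan_generator
# yields candidate plans, and target_func maps one plan to a figure of merit
# (optionally a (value, extra_info...) tuple, or None to discard the plan).
# With maximize=False, the plan with the smallest value is returned together
# with that value, e.g.
#
#   def target_func(plan):
#       return plan.make_kernel(discr).benchmark()  # cost per run
#
#   plan, cost = optimize_plan("diff", generate_plans, target_func,
#           maximize=False, debug_flags=discr.debug)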


class ExecutionPlan(object):
    def __init__(self, given):
        self.given = given

    def invalid_reason(self):
        try:
            self.occupancy_record()
            return None
        except ValueError, ve:
            return str(ve)

    def max_registers(self):
        regs = self.registers()

        from pycuda.tools import OccupancyRecord
        while True:
            try:
                OccupancyRecord(self.given.devdata,
                        self.threads(), self.shared_mem_use(),
                        registers=regs+1)
            except ValueError:
                return regs

            regs += 1

    @memoize_method
    def occupancy_record(self):
        from pycuda.tools import OccupancyRecord
        return OccupancyRecord(self.given.devdata,
                self.threads(), self.shared_mem_use(),
                registers=self.registers())

    def __str__(self):
        return ("regs=%d(+%d) threads=%d smem=%d occ=%f" % (
            self.registers(),
            self.max_registers()-self.registers(),
            self.threads(),
            self.shared_mem_use(),
            self.occupancy_record().occupancy,
            ))
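

# Hypothetical illustration of what an ExecutionPlan subclass reports:
# registers(), threads() and shared_mem_use() come from the subclass, and
# __str__ might render something like
#
#   regs=12(+4) threads=128 smem=1920 occ=0.500000
#
# where "+4" is, roughly, the register headroom before OccupancyRecord
# rejects the launch configuration (see max_registers above).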


class PlanGivenData(object):
    def __init__(self, devdata, ldis, allow_microblocking, float_type):
        self.devdata = devdata
        self.ldis = ldis
        self.float_type = numpy.dtype(float_type)

        self.microblock = self._find_microblock_size(allow_microblocking)

    def float_size(self):
        return self.float_type.itemsize

    def order(self):
        return self.ldis.order

    @memoize_method
    def dofs_per_el(self):
        return self.ldis.node_count()

    @memoize_method
    def dofs_per_face(self):
        return self.ldis.face_node_count()

    def faces_per_el(self):
        return self.ldis.face_count()

    def face_dofs_per_el(self):
        return self.ldis.face_node_count()*self.faces_per_el()

    def face_dofs_per_microblock(self):
        return self.microblock.elements*self.faces_per_el()*self.dofs_per_face()

    @memoize_method
    def aligned_face_dofs_per_microblock(self):
        return self.devdata.align_dtype(
                self.face_dofs_per_microblock(),
                self.float_size())

    def _find_microblock_size(self, allow_microblocking):
        from hedge.backends.cuda.tools import int_ceiling
        align_size = self.devdata.align_words(self.float_size())

        from pytools import Record
        class MicroblockInfo(Record):
            pass

        if not allow_microblocking:
            return MicroblockInfo(
                    align_size=align_size,
                    elements=1,
                    aligned_floats=int_ceiling(self.dofs_per_el(), align_size)
                    )

        for mb_align_chunks in range(1, 256):
            mb_aligned_floats = align_size * mb_align_chunks
            mb_elements = mb_aligned_floats // self.dofs_per_el()
            mb_floats = self.dofs_per_el()*mb_elements
            overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
            if overhead <= 0.05:
                return MicroblockInfo(
                        align_size=align_size,
                        elements=mb_elements,
                        aligned_floats=mb_aligned_floats,
                        )

        assert False, "a valid microblock size was not found"
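
    # Worked example (hypothetical numbers): with dofs_per_el() == 10 and an
    # alignment granularity of align_size == 16 floats, one aligned chunk
    # (16 floats) fits a single element and wastes 6/16 = 37.5% > 5%, so the
    # loop keeps growing the microblock; at mb_align_chunks == 5, 80 aligned
    # floats hold 8 whole elements (80 floats) with 0% padding overhead, and
    # MicroblockInfo(align_size=16, elements=8, aligned_floats=80) is returned.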

    def post_decomposition(self, block_count, microblocks_per_block):
        self.block_count = block_count
        self.microblocks_per_block = microblocks_per_block

    # The methods below become available only after post_decomposition()
    # has been called.
    def matmul_preimage_shape(self, matmul_plan):
        from hedge.backends.cuda.tools import int_ceiling
        fof_dofs = (
                self.block_count
                * self.microblocks_per_block
                * matmul_plan.aligned_preimage_dofs_per_microblock)
        fof_dofs = int_ceiling(fof_dofs, matmul_plan.preimage_dofs_per_macroblock())

        return (fof_dofs,)

    def elements_per_block(self):
        return self.microblocks_per_block * self.microblock.elements

    def dofs_per_block(self):
        return self.microblock.aligned_floats * self.microblocks_per_block

    def total_dofs(self):
        return self.block_count * self.dofs_per_block()


class SegmentedMatrixLocalOpExecutionPlan(ExecutionPlan):
    def __init__(self, given, parallelism, segment_size, max_unroll):
        ExecutionPlan.__init__(self, given.devdata)
        self.given = given
        self.parallelism = parallelism
        self.segment_size = segment_size
        self.max_unroll = max_unroll

    def segments_per_microblock(self):
        from hedge.backends.cuda.tools import int_ceiling
        return int_ceiling(
                self.given.microblock.aligned_floats/self.segment_size)

    def dofs_per_macroblock(self):
        return self.parallelism.total() * self.given.microblock.aligned_floats

    def preimage_dofs_per_macroblock(self):
        return self.parallelism.total() * self.aligned_preimage_dofs_per_microblock

    def fetch_buffer_segments(self):
        given = self.given

        from hedge.backends.cuda.tools import int_ceiling
        if given.dofs_per_el() > self.segment_size:
            return 2
        else:
            return int_ceiling(self.segment_size/given.dofs_per_el()) + 1

    @memoize_method
    def shared_mem_use(self):
        given = self.given

        return (128  # parameters, block header, small extra stuff
                + given.float_size() * (
                    # segment of the local op matrix
                    + self.segment_size  # this many rows
                    * self.columns()
                    # fetch buffer for each segment
                    + self.parallelism.parallel*self.parallelism.inline
                    * self.segment_size
                    * self.fetch_buffer_segments()
                    )
                )
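
    # Rough accounting sketch with made-up numbers: for float32
    # (float_size() == 4), segment_size == 16, columns() == 20 (supplied by a
    # concrete subclass), Parallelism(parallel=2, inline=2, serial=1) and
    # fetch_buffer_segments() == 2, this estimates
    #   128 + 4*(16*20 + 2*2*16*2) = 128 + 4*448 = 1920
    # bytes of shared memory per thread block.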

    def threads(self):
        return self.parallelism.parallel*self.segment_size

    def __str__(self):
        return ("seg_matrix %s par=%s segment_size=%d unroll=%d" % (
            ExecutionPlan.__str__(self),
            self.parallelism,
            self.segment_size,
            self.max_unroll))


class SMemFieldLocalOpExecutionPlan(ExecutionPlan):
    def __init__(self, given, parallelism, max_unroll):
        ExecutionPlan.__init__(self, given.devdata)
        self.given = given
        self.parallelism = parallelism
        self.max_unroll = max_unroll

    def dofs_per_macroblock(self):
        return self.parallelism.total() * self.given.microblock.aligned_floats

    def preimage_dofs_per_macroblock(self):
        return (self.parallelism.total()
                * self.aligned_preimage_dofs_per_microblock)

    def threads(self):
        return self.parallelism.parallel * self.given.microblock.aligned_floats

    def __str__(self):
        return "smem_field %s par=%s unroll=%d" % (
                ExecutionPlan.__str__(self),
                self.parallelism,
                self.max_unroll)


MAX_INLINE = 6


def make_diff_plan(discr, given):
    def generate_plans():
        segment_sizes = range(given.microblock.align_size,
                given.microblock.elements*given.dofs_per_el()+1,
                given.microblock.align_size)

        from hedge.backends.cuda.diff_shared_segmat import ExecutionPlan as SSegPlan

        if "cuda_no_smem_matrix" not in discr.debug:
            for pe in range(1, 32+1):
                for inline in range(1, MAX_INLINE+1):
                    for seq in range(1, 4):
                        for segment_size in segment_sizes:
                            yield SSegPlan(
                                    given, Parallelism(pe, inline, seq),
                                    segment_size,
                                    max_unroll=given.dofs_per_el())

        from hedge.backends.cuda.diff_shared_fld import ExecutionPlan as SFieldPlan

        for pe in range(1, 32+1):
            for inline in range(1, MAX_INLINE+1):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        max_unroll=given.dofs_per_el())

    def target_func(plan):
        return plan.make_kernel(discr).benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan("diff", generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="diff-%d" % given.order())


def make_element_local_plan(discr, given,
        op_name, aligned_preimage_dofs_per_microblock,
        preimage_dofs_per_el, with_index_check):
    def generate_plans():
        if "cuda_no_smem_matrix" not in discr.debug:
            from hedge.backends.cuda.el_local_shared_segmat import ExecutionPlan as SSegPlan

            for use_prefetch_branch in [True]:
                #for use_prefetch_branch in [True, False]:
                segment_sizes = range(given.microblock.align_size,
                        given.microblock.elements*given.dofs_per_el()+1,
                        given.microblock.align_size)

                for pe in range(1, 32+1):
                    for inline in range(1, MAX_INLINE+1):
                        for seq in range(1, 4+1):
                            for segment_size in segment_sizes:
                                yield SSegPlan(given,
                                        Parallelism(pe, inline, seq),
                                        segment_size,
                                        max_unroll=preimage_dofs_per_el,
                                        use_prefetch_branch=use_prefetch_branch,

                                        debug_name="cuda_%s" % op_name,
                                        aligned_preimage_dofs_per_microblock=
                                        aligned_preimage_dofs_per_microblock,
                                        preimage_dofs_per_el=preimage_dofs_per_el)

        from hedge.backends.cuda.el_local_shared_fld import ExecutionPlan as SFieldPlan

        for pe in range(1, 32+1):
            for inline in range(1, MAX_INLINE):
                yield SFieldPlan(given, Parallelism(pe, inline, 1),
                        max_unroll=preimage_dofs_per_el,

                        debug_name="cuda_%s" % op_name,
                        aligned_preimage_dofs_per_microblock=
                        aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el)

    def target_func(plan):
        return (plan
                .make_kernel(discr, with_index_check=with_index_check)
                .benchmark())

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan(
            op_name, generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="%s-%d" % (op_name, given.order()))
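

# Illustrative (hypothetical) call site: both plan factories return the
# (plan, value) pair produced by optimize_plan, where the value is the
# benchmarked cost of the chosen plan's kernel:
#
#   plan, cost = make_diff_plan(discr, given)
#   print "chose %s (cost %g)" % (plan, cost)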