Package hedge :: Package cuda :: Module plan
[frames] | [no frames]

Source Code for Module hedge.cuda.plan

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  from pytools import memoize, memoize_method 
 26   
 27   
 28   
 29   
class Parallelism(object):
    """Defines how much of a task is accomplished sequentially vs. in parallel.

    New-style class (inherits ``object``) for consistency with
    ``ExecutionPlan`` below.
    """

    def __init__(self, parallel, serial):
        # p: number of work units processed in parallel
        # s: number of work units processed sequentially per parallel lane
        self.p = parallel
        self.s = serial

    def total(self):
        """Return the total number of work units covered by this split."""
        return self.p*self.s

    def __str__(self):
        return "(p%d s%d)" % (self.p, self.s)
41 42 43 44
def optimize_plan(plan_generator, max_func):
    """Choose the best execution plan from *plan_generator*.

    Candidate plans whose ``invalid_reason()`` is not None are discarded.
    Of the remaining plans, only those achieving (within 1e-10) the best
    available occupancy are considered; among these, the plan maximizing
    *max_func* is returned.

    :raises RuntimeError: if no candidate plan is valid.
    """
    plans = [p for p in plan_generator()
            if p.invalid_reason() is None]

    if not plans:
        # parenthesized raise: valid in both Python 2 and Python 3
        # (the old "raise E, msg" form is a syntax error in Python 3)
        raise RuntimeError("no valid CUDA execution plans found")

    desired_occup = max(plan.occupancy_record().occupancy for plan in plans)
    #if desired_occup > 0.75:
    #    # see http://forums.nvidia.com/lofiversion/index.php?t67766.html
    #    desired_occup = 0.75

    # builtin max(key=...) replaces pytools.argmax2: returns the (first)
    # plan with the largest max_func value among top-occupancy plans.
    return max(
            (p for p in plans
                if p.occupancy_record().occupancy >= desired_occup - 1e-10),
            key=max_func)
class ExecutionPlan(object):
    """Base class for CUDA execution plans.

    Subclasses must provide ``threads()``, ``registers()`` and
    ``shared_mem_use()``; this base class checks the resulting resource
    usage against the device limits recorded in *devdata*.
    """

    def __init__(self, devdata):
        # devdata: device capability record; this class reads its
        # max_threads, shared_memory and registers attributes.
        self.devdata = devdata

    def invalid_reason(self):
        """Return a human-readable reason why this plan cannot run on the
        device, or None if the plan fits within the device limits.
        """
        thread_count = self.threads()
        if thread_count >= self.devdata.max_threads:
            return "too many threads"
        if self.shared_mem_use() >= int(self.devdata.shared_memory):
            return "too much shared memory"
        if thread_count * self.registers() > self.devdata.registers:
            return "too many registers"
        return None

    @memoize_method
    def occupancy_record(self):
        """Compute (and cache, via memoize_method) this plan's occupancy."""
        from hedge.cuda.tools import OccupancyRecord
        return OccupancyRecord(
                self.devdata, self.threads(), self.shared_mem_use(),
                registers=self.registers())

    def __str__(self):
        return "regs=%d threads=%d smem=%d occ=%f" % (
                self.registers(),
                self.threads(),
                self.shared_mem_use(),
                self.occupancy_record().occupancy)
@memoize
def find_microblock_size(devdata, dofs_per_el, float_size):
    """Find a microblock size with at most 5% alignment padding overhead.

    Tries successively larger multiples of the device's alignment
    granule until the padding wasted at the end of the microblock drops
    to 5% or less of its total float count.

    Returns a pytools.Record with align_size, elements, aligned_floats
    and accesses fields.

    :raises RuntimeError: if no acceptable size exists below 256 chunks.
    """
    # int_ceiling was imported here but never used -- removed.
    from hedge.cuda.tools import exact_div
    # alignment granule measured in floats rather than bytes
    align_size = exact_div(devdata.align_bytes(float_size), float_size)

    for mb_align_chunks in range(1, 256):
        mb_aligned_floats = align_size * mb_align_chunks
        mb_elements = mb_aligned_floats // dofs_per_el
        mb_floats = dofs_per_el*mb_elements
        # fraction of the aligned block that is padding
        # (true division via the module-level __future__ import)
        overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
        if overhead <= 0.05:
            from pytools import Record
            return Record(
                    align_size=align_size,
                    elements=mb_elements,
                    aligned_floats=mb_aligned_floats,
                    accesses=mb_align_chunks
                    )

    # was "assert False, ..." -- asserts vanish under python -O, so raise
    raise RuntimeError("a valid microblock size was not found")
class FluxExecutionPlan(ExecutionPlan):
    """Execution plan for the flux-gather stage.

    Resource usage is derived from the local discretization *ldis*
    (node/face counts), the number of faces processed in parallel, and
    the number of microblocks per CUDA block.
    """

    def __init__(self, devdata, ldis,
            parallel_faces, mbs_per_block,
            max_ext_faces=None, max_faces=None,
            float_type=numpy.float32,
            ):
        """
        :param ldis: local discretization (provides node_count(),
            face_node_count(), face_count(), dimensions).
        :param parallel_faces: faces processed concurrently per block.
        :param mbs_per_block: microblocks per CUDA block.
        :param max_ext_faces: known bound on external faces per block,
            or None to use the estimate.
        :param max_faces: known bound on total faces per block, or None.
        """
        ExecutionPlan.__init__(self, devdata)
        self.ldis = ldis
        self.parallel_faces = parallel_faces
        self.mbs_per_block = mbs_per_block

        self.max_ext_faces = max_ext_faces
        self.max_faces = max_faces

        self.float_type = numpy.dtype(float_type)

        self.microblock = find_microblock_size(
                self.devdata, ldis.node_count(), self.float_size)

    @property
    def float_size(self):
        # size in bytes of one floating-point value
        return self.float_type.itemsize

    def copy(self, devdata=None, ldis=None,
            parallel_faces=None, mbs_per_block=None,
            max_ext_faces=None, max_faces=None, float_type=None):
        """Return a copy of this plan, overriding any argument that is
        not None.

        NOTE(review): this previously used ``new or self.old``, which
        silently discarded legitimate falsy overrides (e.g. an explicit
        0 for max_ext_faces); explicit None checks fix that.
        """
        def pick(new, old):
            return old if new is None else new
        return self.__class__(
                pick(devdata, self.devdata),
                pick(ldis, self.ldis),
                pick(parallel_faces, self.parallel_faces),
                pick(mbs_per_block, self.mbs_per_block),
                pick(max_ext_faces, self.max_ext_faces),
                pick(max_faces, self.max_faces),
                pick(float_type, self.float_type),
                )

    def dofs_per_el(self):
        return self.ldis.node_count()

    def dofs_per_face(self):
        return self.ldis.face_node_count()

    def faces_per_el(self):
        return self.ldis.face_count()

    def face_dofs_per_el(self):
        return self.ldis.face_node_count()*self.faces_per_el()

    def microblocks_per_block(self):
        return self.mbs_per_block

    def elements_per_block(self):
        return self.microblocks_per_block()*self.microblock.elements

    def dofs_per_block(self):
        # counts alignment padding, i.e. aligned floats, not element dofs
        return self.microblocks_per_block()*self.microblock.aligned_floats

    @memoize_method
    def estimate_extface_count(self):
        """Estimate the number of external (block-boundary) faces.

        Models the block's elements as an equivalent macro-cube and
        counts the element faces needed to tile its surface.
        """
        d = self.ldis.dimensions

        # How many equivalent cubes would I need to tesselate the same
        # space as the elements in my thread block?
        from pytools import factorial
        equiv_cubes = self.elements_per_block() / factorial(d)

        # If these cubes in turn formed a perfect macro-cube, how long
        # would its side be?  (true division via __future__ import)
        macrocube_side = equiv_cubes ** (1/d)

        # What total face area does the macro-cube have?
        macrocube_face_area = 2*d * macrocube_side ** (d-1)

        # How many of my faces do I need to tesselate this face area?
        return macrocube_face_area * factorial(d-1)

    def get_extface_count(self):
        from hedge.cuda.tools import int_ceiling

        if self.max_ext_faces is None:
            return int_ceiling(self.estimate_extface_count())
        else:
            return self.max_ext_faces

    @memoize_method
    def face_count(self):
        """Total faces per block: interior faces plus external ones."""
        if self.max_faces is not None:
            return self.max_faces
        else:
            return (self.elements_per_block() * self.faces_per_el() +
                    self.get_extface_count())

    def face_pair_count(self):
        # each pair covers two faces; round up for a possible odd face
        return (self.face_count()+1) // 2

    def face_dofs_per_microblock(self):
        return self.microblock.elements*self.faces_per_el()*self.dofs_per_face()

    def aligned_face_dofs_per_microblock(self):
        # NOTE(review): the "def" line of this method was lost in the
        # source extraction; the name is recovered from its call in
        # shared_mem_use() below.
        return self.devdata.align_dtype(
                self.face_dofs_per_microblock(),
                self.float_size)

    @memoize_method
    def shared_mem_use(self):
        """Estimate shared memory use in bytes for one block."""
        from hedge.cuda.execute import face_pair_struct
        d = self.ldis.dimensions

        # index list entries need 2 bytes once a face has >255 dofs
        if self.dofs_per_face() > 255:
            index_lists_entry_size = 2
        else:
            index_lists_entry_size = 1

        return (128 # parameters, block header, small extra stuff
                + self.aligned_face_dofs_per_microblock()
                * self.microblocks_per_block()
                * self.float_size
                + len(face_pair_struct(self.float_type, d))*self.face_pair_count()
                + index_lists_entry_size*20*self.dofs_per_face()
                )

    def threads(self):
        return self.parallel_faces*self.dofs_per_face()

    def registers(self):
        # empirically observed register use of the flux-gather kernel
        return 12

    def _optimize_chunked_plan(self, plan_class):
        # Shared search over (parallelism, chunk_size) used by both
        # diff_plan() and flux_lifting_plan(); they differed only in the
        # plan class they instantiated.
        def generate_valid_plans():
            chunk_sizes = range(self.microblock.align_size,
                    self.microblock.elements*self.dofs_per_el()+1,
                    self.microblock.align_size)

            for pe in range(1, 32):
                localop_par = Parallelism(pe, 256//pe)
                for chunk_size in chunk_sizes:
                    yield plan_class(self, localop_par, chunk_size)

        return optimize_plan(
                generate_valid_plans,
                lambda plan: plan.parallelism.total()
                )

    @memoize_method
    def diff_plan(self):
        """Pick (and cache) the best differentiation sub-plan."""
        return self._optimize_chunked_plan(DiffExecutionPlan)

    @memoize_method
    def flux_lifting_plan(self):
        """Pick (and cache) the best flux-lifting sub-plan."""
        return self._optimize_chunked_plan(FluxLiftingExecutionPlan)

    def __str__(self):
        return ("%s pfaces=%d mbs_per_block=%d mb_elements=%d" % (
            ExecutionPlan.__str__(self),
            self.parallel_faces,
            self.mbs_per_block,
            self.microblock.elements,
            ))
class ChunkedLocalOperatorExecutionPlan(ExecutionPlan):
    """Base plan for chunked element-local matrix operators.

    Subclasses (differentiation, flux lifting) supply ``columns()``,
    ``registers()`` and ``fetch_buffer_chunks()``.
    """

    def __init__(self, flux_plan, parallelism, chunk_size):
        """
        :param flux_plan: the governing FluxExecutionPlan (supplies
            devdata and the microblock layout).
        :param parallelism: a Parallelism instance.
        :param chunk_size: rows of the operator matrix per chunk.
        """
        ExecutionPlan.__init__(self, flux_plan.devdata)
        self.flux_plan = flux_plan
        self.parallelism = parallelism
        self.chunk_size = chunk_size

    def chunks_per_microblock(self):
        from hedge.cuda.tools import int_ceiling
        return int_ceiling(
                self.flux_plan.microblock.aligned_floats/self.chunk_size)

    def dofs_per_macroblock(self):
        return self.parallelism.total() * self.flux_plan.microblock.aligned_floats

    def max_elements_touched_by_chunk(self):
        # NOTE(review): the "def" line of this method was lost in the
        # source extraction (original line 316); the name is
        # reconstructed from context -- TODO confirm against upstream.
        # A chunk of chunk_size dofs can straddle at most this many
        # elements' dof ranges.
        fplan = self.flux_plan

        from hedge.cuda.tools import int_ceiling
        if fplan.dofs_per_el() > self.chunk_size:
            return 2
        else:
            return int_ceiling(self.chunk_size/fplan.dofs_per_el()) + 1

    @memoize_method
    def shared_mem_use(self):
        """Estimate shared memory use in bytes for one block."""
        fplan = self.flux_plan

        return (64 # parameters, block header, small extra stuff
                + fplan.float_size * (
                    # chunk of the operator matrix:
                    # chunk_size rows by columns() columns
                    self.chunk_size * self.columns()
                    # fetch buffer for each parallel chunk
                    + self.parallelism.p
                    * self.chunk_size
                    * self.fetch_buffer_chunks()
                    )
                )

    def threads(self):
        return self.parallelism.p*self.chunk_size

    def __str__(self):
        return ("%s par=%s chunk_size=%d" % (
            ExecutionPlan.__str__(self),
            self.parallelism,
            self.chunk_size,
            ))
class DiffExecutionPlan(ChunkedLocalOperatorExecutionPlan):
    """Chunked plan for the differentiation (local derivative) kernel."""

    def columns(self):
        # one matrix column block per reference coordinate (r, s, t)
        plan = self.flux_plan
        return plan.dofs_per_el() * plan.ldis.dimensions # r,s,t

    def registers(self):
        # empirically observed register use of the differentiation kernel
        return 17

    def fetch_buffer_chunks(self):
        # the differentiation kernel needs no fetch buffer
        return 0
class FluxLiftingExecutionPlan(ChunkedLocalOperatorExecutionPlan):
    """Chunked plan for the flux-lifting kernel."""

    def columns(self):
        # lifting matrix has one column per face dof of the element
        return self.flux_plan.face_dofs_per_el()

    def registers(self):
        # empirically observed register use of the lifting kernel
        return 13

    def fetch_buffer_chunks(self):
        # the lifting kernel double-buffers one chunk of fetched data
        return 1
def _test_planner():
    # NOTE(review): stale smoke test.  It instantiates
    # ExecutionPlan(ldis, flux_par), but ExecutionPlan.__init__ in this
    # module takes only devdata; it also calls plan.occupancy() and
    # plan.find_localop_par(), neither of which exists on the classes
    # visible here (occupancy_record()/diff_plan() look like the current
    # equivalents).  TODO: update to the current API or remove.
    from hedge.element import TetrahedralElement
    for order in [3]:
        for pe in range(2,16):
            for se in range(1,16):
                flux_par = Parallelism(pe, se)
                plan = ExecutionPlan(TetrahedralElement(order), flux_par)
                inv_reas = plan.invalid_reason()
                if inv_reas is None:
                    # Python 2 print statement (module predates Python 3)
                    print "o%d %s: smem=%d extfacepairs=%d/%d occ=%f (%s) lop:%s" % (
                            order, flux_par,
                            plan.shared_mem_use(),
                            plan.estimate_extface_count(),
                            plan.face_count()//2,
                            plan.occupancy().occupancy,
                            plan.occupancy().limited_by,
                            plan.find_localop_par()
                            )
                else:
                    print "o%d p%d s%d: %s" % (order, pe, se, inv_reas)




if __name__ == "__main__":
    # CUDA must be initialized before any device queries the planner makes
    import pycuda.driver as drv
    drv.init()

    _test_planner()