Package hedge :: Package cuda :: Module plan
[frames] | [no frames]

Source Code for Module hedge.cuda.plan

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  from pytools import memoize, memoize_method 
 26   
 27   
 28   
 29   
class Parallelism(object):
    """Defines how much of a task is accomplished sequentially vs. in parallel.

    New-style class (inherits ``object``) for consistency with
    ``ExecutionPlan`` below.
    """

    def __init__(self, parallel, serial):
        # p: number of work units processed in parallel
        # s: number of work units processed sequentially per parallel lane
        self.p = parallel
        self.s = serial

    def total(self):
        """Return the total number of work units covered by this split."""
        return self.p*self.s

    def __str__(self):
        return "(p%d s%d)" % (self.p, self.s)
41 42 43 44
def optimize_plan(plan_generator, max_func):
    """Choose the best execution plan from *plan_generator*.

    Candidate plans whose ``invalid_reason()`` is not None are discarded.
    Of the remaining plans, only those achieving (within 1e-10) the best
    available occupancy are considered; among these, the plan maximizing
    *max_func* is returned.

    :raises RuntimeError: if no candidate plan is valid.
    """
    plans = [p for p in plan_generator()
            if p.invalid_reason() is None]

    if not plans:
        # parenthesized raise: valid in both Python 2 and Python 3
        # (the old "raise E, msg" form is a syntax error in Python 3)
        raise RuntimeError("no valid CUDA execution plans found")

    desired_occup = max(plan.occupancy_record().occupancy for plan in plans)
    #if desired_occup > 0.75:
    #    # see http://forums.nvidia.com/lofiversion/index.php?t67766.html
    #    desired_occup = 0.75

    # builtin max(key=...) replaces pytools.argmax2: returns the (first)
    # plan with the largest max_func value among top-occupancy plans.
    return max(
            (p for p in plans
                if p.occupancy_record().occupancy >= desired_occup - 1e-10),
            key=max_func)
class ExecutionPlan(object):
    """Base class for CUDA execution plans.

    Subclasses must provide ``threads()``, ``registers()`` and
    ``shared_mem_use()``; this base class checks the resulting resource
    usage against the device limits recorded in *devdata*.
    """

    def __init__(self, devdata):
        # devdata: device capability record; this class reads its
        # max_threads, shared_memory and registers attributes.
        self.devdata = devdata

    def invalid_reason(self):
        """Return a human-readable reason why this plan cannot run on the
        device, or None if the plan fits within the device limits.
        """
        thread_count = self.threads()
        if thread_count >= self.devdata.max_threads:
            return "too many threads"
        if self.shared_mem_use() >= int(self.devdata.shared_memory):
            return "too much shared memory"
        if thread_count * self.registers() > self.devdata.registers:
            return "too many registers"
        return None

    @memoize_method
    def occupancy_record(self):
        """Compute (and cache, via memoize_method) this plan's occupancy."""
        from hedge.cuda.tools import OccupancyRecord
        return OccupancyRecord(
                self.devdata, self.threads(), self.shared_mem_use(),
                registers=self.registers())

    def __str__(self):
        return "regs=%d threads=%d smem=%d occ=%f" % (
                self.registers(),
                self.threads(),
                self.shared_mem_use(),
                self.occupancy_record().occupancy)
@memoize
def find_microblock_size(devdata, dofs_per_el, float_size):
    """Find a microblock size with at most 5% alignment padding overhead.

    Tries successively larger multiples of the device's alignment
    granule until the padding wasted at the end of the microblock drops
    to 5% or less of its total float count.

    Returns a pytools.Record with align_size, elements, aligned_floats
    and accesses fields.

    :raises RuntimeError: if no acceptable size exists below 256 chunks.
    """
    # int_ceiling was imported here but never used -- removed.
    from hedge.cuda.tools import exact_div
    # alignment granule measured in floats rather than bytes
    align_size = exact_div(devdata.align_bytes(float_size), float_size)

    for mb_align_chunks in range(1, 256):
        mb_aligned_floats = align_size * mb_align_chunks
        mb_elements = mb_aligned_floats // dofs_per_el
        mb_floats = dofs_per_el*mb_elements
        # fraction of the aligned block that is padding
        # (true division via the module-level __future__ import)
        overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
        if overhead <= 0.05:
            from pytools import Record
            return Record(
                    align_size=align_size,
                    elements=mb_elements,
                    aligned_floats=mb_aligned_floats,
                    accesses=mb_align_chunks
                    )

    # was "assert False, ..." -- asserts vanish under python -O, so raise
    raise RuntimeError("a valid microblock size was not found")
class FluxExecutionPlan(ExecutionPlan):
    """Execution plan for the flux-gather stage.

    Resource usage is derived from the local discretization *ldis*
    (node/face counts), the number of faces processed in parallel, and
    the number of microblocks per CUDA block.
    """

    def __init__(self, devdata, ldis,
            parallel_faces, mbs_per_block,
            max_ext_faces=None, max_faces=None,
            float_type=numpy.float32,
            ):
        """
        :param ldis: local discretization (provides node_count(),
            face_node_count(), face_count(), dimensions).
        :param parallel_faces: faces processed concurrently per block.
        :param mbs_per_block: microblocks per CUDA block.
        :param max_ext_faces: known bound on external faces per block,
            or None to use the estimate.
        :param max_faces: known bound on total faces per block, or None.
        """
        ExecutionPlan.__init__(self, devdata)
        self.ldis = ldis
        self.parallel_faces = parallel_faces
        self.mbs_per_block = mbs_per_block

        self.max_ext_faces = max_ext_faces
        self.max_faces = max_faces

        self.float_type = numpy.dtype(float_type)

        self.microblock = find_microblock_size(
                self.devdata, ldis.node_count(), self.float_size)

    @property
    def float_size(self):
        # size in bytes of one floating-point value
        return self.float_type.itemsize

    def copy(self, devdata=None, ldis=None,
            parallel_faces=None, mbs_per_block=None,
            max_ext_faces=None, max_faces=None, float_type=None):
        """Return a copy of this plan, overriding any argument that is
        not None.

        NOTE(review): this previously used ``new or self.old``, which
        silently discarded legitimate falsy overrides (e.g. an explicit
        0 for max_ext_faces); explicit None checks fix that.
        """
        def pick(new, old):
            return old if new is None else new
        return self.__class__(
                pick(devdata, self.devdata),
                pick(ldis, self.ldis),
                pick(parallel_faces, self.parallel_faces),
                pick(mbs_per_block, self.mbs_per_block),
                pick(max_ext_faces, self.max_ext_faces),
                pick(max_faces, self.max_faces),
                pick(float_type, self.float_type),
                )

    def dofs_per_el(self):
        return self.ldis.node_count()

    def dofs_per_face(self):
        return self.ldis.face_node_count()

    def faces_per_el(self):
        return self.ldis.face_count()

    def face_dofs_per_el(self):
        return self.ldis.face_node_count()*self.faces_per_el()

    def microblocks_per_block(self):
        return self.mbs_per_block

    def elements_per_block(self):
        return self.microblocks_per_block()*self.microblock.elements

    def dofs_per_block(self):
        # counts alignment padding, i.e. aligned floats, not element dofs
        return self.microblocks_per_block()*self.microblock.aligned_floats

    @memoize_method
    def estimate_extface_count(self):
        """Estimate the number of external (block-boundary) faces.

        Models the block's elements as an equivalent macro-cube and
        counts the element faces needed to tile its surface.
        """
        d = self.ldis.dimensions

        # How many equivalent cubes would I need to tesselate the same
        # space as the elements in my thread block?
        from pytools import factorial
        equiv_cubes = self.elements_per_block() / factorial(d)

        # If these cubes in turn formed a perfect macro-cube, how long
        # would its side be?  (true division via __future__ import)
        macrocube_side = equiv_cubes ** (1/d)

        # What total face area does the macro-cube have?
        macrocube_face_area = 2*d * macrocube_side ** (d-1)

        # How many of my faces do I need to tesselate this face area?
        return macrocube_face_area * factorial(d-1)

    def get_extface_count(self):
        from hedge.cuda.tools import int_ceiling

        if self.max_ext_faces is None:
            return int_ceiling(self.estimate_extface_count())
        else:
            return self.max_ext_faces

    @memoize_method
    def face_count(self):
        """Total faces per block: interior faces plus external ones."""
        if self.max_faces is not None:
            return self.max_faces
        else:
            return (self.elements_per_block() * self.faces_per_el() +
                    self.get_extface_count())

    def face_pair_count(self):
        # each pair covers two faces; round up for a possible odd face
        return (self.face_count()+1) // 2

    def face_dofs_per_microblock(self):
        return self.microblock.elements*self.faces_per_el()*self.dofs_per_face()

    def aligned_face_dofs_per_microblock(self):
        # NOTE(review): the "def" line of this method was lost in the
        # source extraction; the name is recovered from its call in
        # shared_mem_use() below.
        return self.devdata.align_dtype(
                self.face_dofs_per_microblock(),
                self.float_size)

    @memoize_method
    def shared_mem_use(self):
        """Estimate shared memory use in bytes for one block."""
        from hedge.cuda.execute import face_pair_struct
        d = self.ldis.dimensions

        # index list entries need 2 bytes once a face has >255 dofs
        if self.dofs_per_face() > 255:
            index_lists_entry_size = 2
        else:
            index_lists_entry_size = 1

        return (128 # parameters, block header, small extra stuff
                + self.aligned_face_dofs_per_microblock()
                * self.microblocks_per_block()
                * self.float_size
                + len(face_pair_struct(self.float_type, d))*self.face_pair_count()
                + index_lists_entry_size*20*self.dofs_per_face()
                )

    def threads(self):
        return self.parallel_faces*self.dofs_per_face()

    def registers(self):
        # empirically observed register use of the flux-gather kernel
        return 12

    def _optimize_chunked_plan(self, plan_class):
        # Shared search over (parallelism, chunk_size) used by both
        # diff_plan() and flux_lifting_plan(); they differed only in the
        # plan class they instantiated.
        def generate_valid_plans():
            chunk_sizes = range(self.microblock.align_size,
                    self.microblock.elements*self.dofs_per_el()+1,
                    self.microblock.align_size)

            for pe in range(1, 32):
                localop_par = Parallelism(pe, 256//pe)
                for chunk_size in chunk_sizes:
                    yield plan_class(self, localop_par, chunk_size)

        return optimize_plan(
                generate_valid_plans,
                lambda plan: plan.parallelism.total()
                )

    @memoize_method
    def diff_plan(self):
        """Pick (and cache) the best differentiation sub-plan."""
        return self._optimize_chunked_plan(DiffExecutionPlan)

    @memoize_method
    def flux_lifting_plan(self):
        """Pick (and cache) the best flux-lifting sub-plan."""
        return self._optimize_chunked_plan(FluxLiftingExecutionPlan)

    def __str__(self):
        return ("%s pfaces=%d mbs_per_block=%d mb_elements=%d" % (
            ExecutionPlan.__str__(self),
            self.parallel_faces,
            self.mbs_per_block,
            self.microblock.elements,
            ))
class ChunkedLocalOperatorExecutionPlan(ExecutionPlan):
    """Base plan for chunked element-local matrix operators.

    Subclasses (differentiation, flux lifting) supply ``columns()``,
    ``registers()`` and ``fetch_buffer_chunks()``.
    """

    def __init__(self, flux_plan, parallelism, chunk_size):
        """
        :param flux_plan: the governing FluxExecutionPlan (supplies
            devdata and the microblock layout).
        :param parallelism: a Parallelism instance.
        :param chunk_size: rows of the operator matrix per chunk.
        """
        ExecutionPlan.__init__(self, flux_plan.devdata)
        self.flux_plan = flux_plan
        self.parallelism = parallelism
        self.chunk_size = chunk_size

    def chunks_per_microblock(self):
        from hedge.cuda.tools import int_ceiling
        return int_ceiling(
                self.flux_plan.microblock.aligned_floats/self.chunk_size)

    def dofs_per_macroblock(self):
        return self.parallelism.total() * self.flux_plan.microblock.aligned_floats

    def max_elements_touched_by_chunk(self):
        # NOTE(review): the "def" line of this method was lost in the
        # source extraction (original line 316); the name is
        # reconstructed from context -- TODO confirm against upstream.
        # A chunk of chunk_size dofs can straddle at most this many
        # elements' dof ranges.
        fplan = self.flux_plan

        from hedge.cuda.tools import int_ceiling
        if fplan.dofs_per_el() > self.chunk_size:
            return 2
        else:
            return int_ceiling(self.chunk_size/fplan.dofs_per_el()) + 1

    @memoize_method
    def shared_mem_use(self):
        """Estimate shared memory use in bytes for one block."""
        fplan = self.flux_plan

        return (64 # parameters, block header, small extra stuff
                + fplan.float_size * (
                    # chunk of the operator matrix:
                    # chunk_size rows by columns() columns
                    self.chunk_size * self.columns()
                    # fetch buffer for each parallel chunk
                    + self.parallelism.p
                    * self.chunk_size
                    * self.fetch_buffer_chunks()
                    )
                )

    def threads(self):
        return self.parallelism.p*self.chunk_size

    def __str__(self):
        return ("%s par=%s chunk_size=%d" % (
            ExecutionPlan.__str__(self),
            self.parallelism,
            self.chunk_size,
            ))
class DiffExecutionPlan(ChunkedLocalOperatorExecutionPlan):
    """Chunked plan for the differentiation (local derivative) kernel."""

    def columns(self):
        # one matrix column block per reference coordinate (r, s, t)
        plan = self.flux_plan
        return plan.dofs_per_el() * plan.ldis.dimensions # r,s,t

    def registers(self):
        # empirically observed register use of the differentiation kernel
        return 17

    def fetch_buffer_chunks(self):
        # the differentiation kernel needs no fetch buffer
        return 0
class FluxLiftingExecutionPlan(ChunkedLocalOperatorExecutionPlan):
    """Chunked plan for the flux-lifting kernel."""

    def columns(self):
        # lifting matrix has one column per face dof of the element
        return self.flux_plan.face_dofs_per_el()

    def registers(self):
        # empirically observed register use of the lifting kernel
        return 13

    def fetch_buffer_chunks(self):
        # the lifting kernel double-buffers one chunk of fetched data
        return 1
def _test_planner():
    # NOTE(review): stale smoke test.  It instantiates
    # ExecutionPlan(ldis, flux_par), but ExecutionPlan.__init__ in this
    # module takes only devdata; it also calls plan.occupancy() and
    # plan.find_localop_par(), neither of which exists on the classes
    # visible here (occupancy_record()/diff_plan() look like the current
    # equivalents).  TODO: update to the current API or remove.
    from hedge.element import TetrahedralElement
    for order in [3]:
        for pe in range(2,16):
            for se in range(1,16):
                flux_par = Parallelism(pe, se)
                plan = ExecutionPlan(TetrahedralElement(order), flux_par)
                inv_reas = plan.invalid_reason()
                if inv_reas is None:
                    # Python 2 print statement (module predates Python 3)
                    print "o%d %s: smem=%d extfacepairs=%d/%d occ=%f (%s) lop:%s" % (
                            order, flux_par,
                            plan.shared_mem_use(),
                            plan.estimate_extface_count(),
                            plan.face_count()//2,
                            plan.occupancy().occupancy,
                            plan.occupancy().limited_by,
                            plan.find_localop_par()
                            )
                else:
                    print "o%d p%d s%d: %s" % (order, pe, se, inv_reas)




if __name__ == "__main__":
    # CUDA must be initialized before any device queries the planner makes
    import pycuda.driver as drv
    drv.init()

    _test_planner()