1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 from pytools import memoize, memoize_method
26
27
28
29
31 """Defines how much of a task is accomplished sequentially vs. in parallel."""
35
38
40 return "(p%d s%d)" % (self.p, self.s)
41
42
43
44
def optimize_plan(plan_generator, max_func):
    """Return the best plan produced by *plan_generator*.

    Keeps only plans whose invalid_reason() is None, restricts attention to
    plans achieving (within float tolerance) the best available occupancy,
    and among those returns the one maximizing *max_func*.

    :param plan_generator: zero-argument callable yielding candidate plans.
    :param max_func: callable mapping a plan to a comparable score.
    :raises RuntimeError: if no generated plan is valid.
    """
    plans = [p for p in plan_generator()
            if p.invalid_reason() is None]

    if not plans:
        # was Python-2-only `raise E, msg` syntax; call form works everywhere
        raise RuntimeError("no valid CUDA execution plans found")

    desired_occup = max(plan.occupancy_record().occupancy for plan in plans)

    # Among maximal-occupancy plans (1e-10 slack for float comparison),
    # pick the highest-scoring one. Builtin max(key=...) replaces
    # pytools.argmax2 over (plan, score) pairs.
    return max(
            (p for p in plans
                if p.occupancy_record().occupancy >= desired_occup - 1e-10),
            key=max_func)
62
63
64
65
68 self.devdata = devdata
69
71 if self.threads() >= self.devdata.max_threads:
72 return "too many threads"
73
74 if self.shared_mem_use() >= int(self.devdata.shared_memory):
75 return "too much shared memory"
76
77 if self.threads()*self.registers() > self.devdata.registers:
78 return "too many registers"
79 return None
80
81 @memoize_method
87
95
96
97
98
@memoize
def find_microblock_size(devdata, dofs_per_el, float_size):
    """Find a microblock size whose padding overhead is at most 5%.

    NOTE(review): the def line was lost in extraction; signature reconstructed
    from the call find_microblock_size(devdata, ldis.node_count(),
    float_size) -- confirm against upstream source.

    :param devdata: device capability record providing align_bytes().
    :param dofs_per_el: number of degrees of freedom per element.
    :param float_size: size in bytes of one scalar.
    :returns: a pytools.Record with align_size (floats per alignment unit),
        elements (whole elements per microblock), aligned_floats (padded
        float count), and accesses (alignment chunks per microblock).
    :raises RuntimeError: if no chunk count up to 255 keeps overhead <= 5%.
    """
    from hedge.cuda.tools import exact_div  # int_ceiling was imported but unused
    # number of floats in one device alignment unit
    align_size = exact_div(devdata.align_bytes(float_size), float_size)

    for mb_align_chunks in range(1, 256):
        mb_aligned_floats = align_size * mb_align_chunks
        mb_elements = mb_aligned_floats // dofs_per_el
        mb_floats = dofs_per_el*mb_elements
        # fraction of the padded microblock wasted on alignment padding
        # (true division via `from __future__ import division`)
        overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
        if overhead <= 0.05:
            from pytools import Record
            return Record(
                    align_size=align_size,
                    elements=mb_elements,
                    aligned_floats=mb_aligned_floats,
                    accesses=mb_align_chunks
                    )

    # raise instead of `assert False`: asserts vanish under python -O
    raise RuntimeError("a valid microblock size was not found")
119
120
121
122
124 - def __init__(self, devdata, ldis,
125 parallel_faces, mbs_per_block,
126 max_ext_faces=None, max_faces=None,
127 float_type=numpy.float32,
128 ):
129 ExecutionPlan.__init__(self, devdata)
130 self.ldis = ldis
131 self.parallel_faces = parallel_faces
132 self.mbs_per_block = mbs_per_block
133
134 self.max_ext_faces = max_ext_faces
135 self.max_faces = max_faces
136
137 self.float_type = numpy.dtype(float_type)
138
139 self.microblock = find_microblock_size(
140 self.devdata, ldis.node_count(), self.float_size)
141
142 @property
144 return self.float_type.itemsize
145
146 - def copy(self, devdata=None, ldis=None,
147 parallel_faces=None, mbs_per_block=None,
148 max_ext_faces=None, max_faces=None, float_type=None):
149 return self.__class__(
150 devdata or self.devdata,
151 ldis or self.ldis,
152 parallel_faces or self.parallel_faces,
153 mbs_per_block or self.mbs_per_block,
154 max_ext_faces or self.max_ext_faces,
155 max_faces or self.max_faces,
156 float_type or self.float_type,
157 )
158
160 return self.ldis.node_count()
161
163 return self.ldis.face_node_count()
164
167
170
172 return self.mbs_per_block
173
176
179
180 @memoize_method
182 d = self.ldis.dimensions
183
184
185
186 from pytools import factorial
187 equiv_cubes = self.elements_per_block() / factorial(d)
188
189
190
191 macrocube_side = equiv_cubes ** (1/d)
192
193
194 macrocube_face_area = 2*d * macrocube_side ** (d-1)
195
196
197 return macrocube_face_area * factorial(d-1)
198
206
207 @memoize_method
214
217
220
225
226 @memoize_method
243
246
249
250 @memoize_method
264
265 return optimize_plan(
266 generate_valid_plans,
267 lambda plan: plan.parallelism.total()
268 )
269
270 @memoize_method
284
285 return optimize_plan(
286 generate_valid_plans,
287 lambda plan: plan.parallelism.total()
288 )
289
291 return ("%s pfaces=%d mbs_per_block=%d mb_elements=%d" % (
292 ExecutionPlan.__str__(self),
293 self.parallel_faces,
294 self.mbs_per_block,
295 self.microblock.elements,
296 ))
297
298
299
300
302 - def __init__(self, flux_plan, parallelism, chunk_size):
303 ExecutionPlan.__init__(self, flux_plan.devdata)
304 self.flux_plan = flux_plan
305 self.parallelism = parallelism
306 self.chunk_size = chunk_size
307
312
314 return self.parallelism.total() * self.flux_plan.microblock.aligned_floats
315
324
325 @memoize_method
327 fplan = self.flux_plan
328
329 return (64
330 + fplan.float_size * (
331
332 + self.chunk_size
333 * self.columns()
334
335 + self.parallelism.p
336 * self.chunk_size
337 * self.fetch_buffer_chunks()
338 )
339 )
340
342 return self.parallelism.p*self.chunk_size
343
345 return ("%s par=%s chunk_size=%d" % (
346 ExecutionPlan.__str__(self),
347 self.parallelism,
348 self.chunk_size,
349 ))
350
351
352
353
354
365
366
367
368
369
379
380
381
def _test_planner():
    """Smoke-test the planner over a sweep of parallelism settings,
    printing each plan's resource use or the reason it is invalid.

    NOTE(review): the def line was lost in extraction; name reconstructed
    from the __main__ call -- confirm against upstream source.
    """
    from hedge.element import TetrahedralElement
    for order in [3]:
        for pe in range(2,16):
            for se in range(1,16):
                flux_par = Parallelism(pe, se)
                # NOTE(review): ExecutionPlan.__init__ elsewhere in this file
                # takes only (devdata); this call looks stale -- confirm
                # which plan class is meant here.
                plan = ExecutionPlan(TetrahedralElement(order), flux_par)
                inv_reas = plan.invalid_reason()
                if inv_reas is None:
                    # parenthesized single-expression print: valid under both
                    # the Python 2 print statement and Python 3
                    print("o%d %s: smem=%d extfacepairs=%d/%d occ=%f (%s) lop:%s" % (
                        order, flux_par,
                        plan.shared_mem_use(),
                        plan.estimate_extface_count(),
                        plan.face_count()//2,
                        plan.occupancy().occupancy,
                        plan.occupancy().limited_by,
                        plan.find_localop_par()
                        ))
                else:
                    print("o%d p%d s%d: %s" % (order, pe, se, inv_reas))
402
403
404
405
406 if __name__ == "__main__":
407 import pycuda.driver as drv
408 drv.init()
409
410 _test_planner()
411