1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 from pytools import memoize_method
31 """Defines how much of a task is accomplished sequentially vs. in-line parallel
32 vs. completely in parallel.
33
34 To fix terminology:
35
36 - "parallel" means "in separate threads".
37 - "inline" means "in the same thread, but sharing some data."
38 - "serial" means "in the same thread, but in separate, data-independent stages."
39 """
40 - def __init__(self, parallel, inline, serial):
41 self.parallel = parallel
42 self.inline = inline
43 self.serial = serial
44
46 return self.parallel*self.inline*self.serial
47
49 return "(%s)" % (" ".join("%s%d" % (cat, count) for cat, count in [
50 ("p", self.parallel), ("i", self.inline), ("s", self.serial)]
51 if count != 1))
52
53
54
55
def optimize_plan(opt_name, plan_generator, target_func, maximize, debug_flags=set(), occupancy_slack=0.5,
        log_filename=None):
    """Benchmark candidate execution plans and return the best one.

    :param opt_name: short name of the operation being planned, used in
        debug-flag names ("cuda_<opt_name>_plan", "cuda_no_plan_<opt_name>")
        and progress-bar labels.
    :param plan_generator: callable returning an iterable of plan objects;
        plans whose ``invalid_reason()`` is not *None* are discarded.
    :param target_func: callable evaluating one plan; returns a numeric cost
        (or a tuple whose first entry is the cost and whose remainder is extra
        logging info), or *None* to reject the plan.
    :param maximize: if true, pick the plan with the largest value of
        *target_func*, otherwise the smallest.
    :param debug_flags: set of debug-flag strings; never mutated here.
    :param occupancy_slack: only plans whose occupancy is at least
        ``occupancy_slack`` times the best achievable occupancy are benchmarked.
    :param log_filename: if given (and "cuda_plan_log" is in *debug_flags*),
        benchmark results are logged to the sqlite file "plan-<log_filename>.dat".
    :returns: a tuple ``(best_plan, best_value)``.
    :raises RuntimeError: if no valid plan exists or none yields a value.
    """
    plans = [p for p in plan_generator() if p.invalid_reason() is None]

    debug = "cuda_%s_plan" % opt_name in debug_flags
    show_progress = ("cuda_plan_no_progress" not in debug_flags) and not debug

    if "cuda_plan_log" not in debug_flags:
        log_filename = None

    if not plans:
        # FIX: was the Python-2-only "raise RuntimeError, msg" statement form.
        raise RuntimeError("no valid CUDA execution plans found")

    if set(["cuda_no_plan", "cuda_no_plan_"+opt_name]) & debug_flags:
        # skip benchmarking entirely: just take the highest-occupancy plan
        from pytools import argmax2
        return argmax2((plan, plan.occupancy_record().occupancy)
                for plan in plans), 0

    max_occup = max(plan.occupancy_record().occupancy for plan in plans)
    desired_occup = occupancy_slack*max_occup

    if log_filename is not None:
        from pytools import single_valued
        feature_columns = single_valued(p.feature_columns() for p in plans)
        # first token of each column declaration is the column name
        feature_names = [fc.split()[0] for fc in feature_columns]

        try:
            import sqlite3 as sqlite
        except ImportError:
            from pysqlite2 import dbapi2 as sqlite

        db_conn = sqlite.connect("plan-%s.dat" % log_filename)

        try:
            db_conn.execute("""
                  create table data (
                    id integer primary key autoincrement,
                    %s,
                    value real)"""
                    % ", ".join(feature_columns))
        except sqlite.OperationalError:
            # table already exists from a previous run
            pass

    if show_progress:
        from pytools import ProgressBar
        pbar = ProgressBar("plan "+opt_name, len(plans))
    try:
        plan_values = []
        for p in plans:
            if show_progress:
                pbar.progress()

            # small epsilon guards against float round-off in the comparison
            if p.occupancy_record().occupancy >= desired_occup - 1e-10:
                if debug:
                    print("<---- trying %s:" % p)

                value = target_func(p)
                if isinstance(value, tuple):
                    extra_info = value[1:]
                    value = value[0]
                else:
                    # FIX: was None, which made p.features(*extra_info) below
                    # raise TypeError whenever logging was enabled and
                    # target_func returned a bare value.
                    extra_info = ()

                if value is not None:
                    if debug:
                        print("----> yielded %g" % (value))
                    plan_values.append(((len(plan_values), p), value))

                    if log_filename is not None:
                        db_conn.execute(
                                "insert into data (%s,value) values (%s)"
                                % (", ".join(feature_names),
                                    ",".join(["?"]*(1+len(feature_names)))),
                                p.features(*extra_info)+(value,))
    finally:
        if show_progress:
            pbar.finished()

    if log_filename is not None:
        db_conn.commit()
        # FIX: the connection was previously leaked
        db_conn.close()

    if not plan_values:
        # robustness: argmax2/argmin2 on an empty sequence would otherwise
        # fail with an unhelpful error
        raise RuntimeError(
                "no CUDA execution plan for '%s' yielded a benchmark value"
                % opt_name)

    from pytools import argmax2, argmin2
    if maximize:
        num_plan, plan = argmax2(plan_values)
    else:
        num_plan, plan = argmin2(plan_values)

    plan_value = plan_values[num_plan][1]

    if debug:
        print("----------------------------------------------")
        print("chosen: %s" % plan)
        print("value: %g" % plan_value)
        print("----------------------------------------------")

    return plan, plan_value
152
160
162 try:
163 self.occupancy_record()
164 return None
165 except ValueError, ve:
166 return str(ve)
167
168 return None
169
171 regs = self.registers()
172
173 from pycuda.tools import OccupancyRecord
174 while True:
175 try:
176 OccupancyRecord(self.given.devdata,
177 self.threads(), self.shared_mem_use(),
178 registers=regs+1)
179 except ValueError:
180 return regs
181
182 regs += 1
183
184 @memoize_method
190
199
def __init__(self, devdata, ldis, allow_microblocking, float_type):
    """Capture device data and discretization info for plan construction.

    :param devdata: a device-data record (e.g. pycuda DeviceData-like).
    :param ldis: the local discretization; ``ldis.order`` is read elsewhere.
    :param allow_microblocking: whether several elements may share one
        aligned microblock.
    :param float_type: anything :func:`numpy.dtype` accepts.
    """
    self.devdata, self.ldis = devdata, ldis
    # normalize to a numpy dtype so itemsize queries work uniformly
    self.float_type = numpy.dtype(float_type)

    # must run last: it reads devdata and float_type set above
    self.microblock = self._find_microblock_size(allow_microblocking)
212 return self.float_type.itemsize
213
215 return self.ldis.order
216
217 @memoize_method
220
221 @memoize_method
224
227
230
233
234 @memoize_method
239
241 from hedge.backends.cuda.tools import int_ceiling
242 align_size = self.devdata.align_words(self.float_size())
243
244 from pytools import Record
245 class MicroblockInfo(Record): pass
246
247 if not allow_microblocking:
248 return MicroblockInfo(
249 align_size=align_size,
250 elements=1,
251 aligned_floats=int_ceiling(self.dofs_per_el(), align_size)
252 )
253
254 for mb_align_chunks in range(1, 256):
255 mb_aligned_floats = align_size * mb_align_chunks
256 mb_elements = mb_aligned_floats // self.dofs_per_el()
257 mb_floats = self.dofs_per_el()*mb_elements
258 overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
259 if overhead <= 0.05:
260 return MicroblockInfo(
261 align_size=align_size,
262 elements=mb_elements,
263 aligned_floats=mb_aligned_floats,
264 )
265
266 assert False, "a valid microblock size was not found"
267
def post_decomposition(self, block_count, microblocks_per_block):
    """Record the GPU block decomposition once it has been determined."""
    self.block_count = block_count
    self.microblocks_per_block = microblocks_per_block
271
272
282
285
288
291
def __init__(self, given, parallelism, segment_size, max_unroll):
    """Set up a segment-matrix execution plan.

    :param given: the shared problem/device data record.
    :param parallelism: a :class:`Parallelism` instance.
    :param segment_size: number of floats per matrix segment.
    :param max_unroll: maximum loop unroll factor for code generation.
    """
    # the base class keeps the device-data record
    ExecutionPlan.__init__(self, given.devdata)

    self.given, self.parallelism = given, parallelism
    self.segment_size, self.max_unroll = segment_size, max_unroll
308
310 return self.parallelism.total() * self.given.microblock.aligned_floats
311
313 return self.parallelism.total() * self.aligned_preimage_dofs_per_microblock
314
323
324 @memoize_method
326 given = self.given
327
328 return (128
329 + given.float_size() * (
330
331 + self.segment_size
332 * self.columns()
333
334 + self.parallelism.parallel*self.parallelism.inline
335 * self.segment_size
336 * self.fetch_buffer_segments()
337 )
338 )
339
341 return self.parallelism.parallel*self.segment_size
342
344 return ("seg_matrix %s par=%s segment_size=%d unroll=%d" % (
345 ExecutionPlan.__str__(self),
346 self.parallelism,
347 self.segment_size,
348 self.max_unroll))
349
def __init__(self, given, parallelism, max_unroll):
    """Set up a shared-field execution plan.

    :param given: the shared problem/device data record.
    :param parallelism: a :class:`Parallelism` instance.
    :param max_unroll: maximum loop unroll factor for code generation.
    """
    # the base class keeps the device-data record
    ExecutionPlan.__init__(self, given.devdata)

    self.given, self.parallelism = given, parallelism
    self.max_unroll = max_unroll
359
361 return self.parallelism.total() * self.given.microblock.aligned_floats
362
364 return (self.parallelism.total()
365 * self.aligned_preimage_dofs_per_microblock)
366
368 return self.parallelism.parallel * self.given.microblock.aligned_floats
369
371 return "smem_field %s par=%s unroll=%d" % (
372 ExecutionPlan.__str__(self),
373 self.parallelism,
374 self.max_unroll)
375
376
377
378
379
380 MAX_INLINE = 6
386 def generate_plans():
387 segment_sizes = range(given.microblock.align_size,
388 given.microblock.elements*given.dofs_per_el()+1,
389 given.microblock.align_size)
390
391 from hedge.backends.cuda.diff_shared_segmat import ExecutionPlan as SSegPlan
392
393 if "cuda_no_smem_matrix" not in discr.debug:
394 for pe in range(1,32+1):
395 for inline in range(1, MAX_INLINE+1):
396 for seq in range(1, 4):
397 for segment_size in segment_sizes:
398 yield SSegPlan(
399 given, Parallelism(pe, inline, seq),
400 segment_size,
401 max_unroll=given.dofs_per_el())
402
403 from hedge.backends.cuda.diff_shared_fld import ExecutionPlan as SFieldPlan
404
405 for pe in range(1,32+1):
406 for inline in range(1, MAX_INLINE+1):
407 yield SFieldPlan(given, Parallelism(pe, inline, 1),
408 max_unroll=given.dofs_per_el())
409
410 def target_func(plan):
411 return plan.make_kernel(discr).benchmark()
412
413 from hedge.backends.cuda.plan import optimize_plan
414 return optimize_plan("diff", generate_plans, target_func, maximize=False,
415 debug_flags=discr.debug,
416 log_filename="diff-%d" % given.order())
417
418
419
420
def make_element_local_plan(discr, given,
        op_name, aligned_preimage_dofs_per_microblock,
        preimage_dofs_per_el, with_index_check):
    """Choose the best execution plan for the element-local operator *op_name*.

    Candidate plans (shared-memory segment-matrix plans unless disabled via
    the "cuda_no_smem_matrix" debug flag, plus shared-field plans) are
    generated and benchmarked through :func:`optimize_plan`, minimizing the
    benchmark time.

    :returns: the ``(plan, value)`` tuple produced by :func:`optimize_plan`.
    """
    def generate_plans():
        if "cuda_no_smem_matrix" not in discr.debug:
            from hedge.backends.cuda.el_local_shared_segmat import \
                    ExecutionPlan as SSegPlan

            for use_prefetch_branch in [True]:
                segment_sizes = range(
                        given.microblock.align_size,
                        given.microblock.elements*given.dofs_per_el()+1,
                        given.microblock.align_size)

                for par_count in range(1, 32+1):
                    for inline_count in range(1, MAX_INLINE+1):
                        for serial_count in range(1, 4+1):
                            for seg_size in segment_sizes:
                                yield SSegPlan(given,
                                        Parallelism(par_count, inline_count,
                                            serial_count),
                                        seg_size,
                                        max_unroll=preimage_dofs_per_el,
                                        use_prefetch_branch=use_prefetch_branch,
                                        debug_name="cuda_%s" % op_name,
                                        aligned_preimage_dofs_per_microblock=
                                        aligned_preimage_dofs_per_microblock,
                                        preimage_dofs_per_el=preimage_dofs_per_el)

        from hedge.backends.cuda.el_local_shared_fld import \
                ExecutionPlan as SFieldPlan

        # NOTE(review): this inline bound is exclusive (MAX_INLINE), unlike
        # the MAX_INLINE+1 used for the segment-matrix plans above -- confirm
        # this asymmetry is intentional.
        for par_count in range(1, 32+1):
            for inline_count in range(1, MAX_INLINE):
                yield SFieldPlan(given,
                        Parallelism(par_count, inline_count, 1),
                        max_unroll=preimage_dofs_per_el,
                        debug_name="cuda_%s" % op_name,
                        aligned_preimage_dofs_per_microblock=
                        aligned_preimage_dofs_per_microblock,
                        preimage_dofs_per_el=preimage_dofs_per_el)

    def target_func(plan):
        kernel = plan.make_kernel(discr, with_index_check=with_index_check)
        return kernel.benchmark()

    from hedge.backends.cuda.plan import optimize_plan
    return optimize_plan(
            op_name, generate_plans, target_func, maximize=False,
            debug_flags=discr.debug,
            log_filename="%s-%d" % (op_name, given.order()))
471