1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
25 import pycuda.driver as cuda
26 import numpy
27 from pytools import Record
28 from pytools.log import LogQuantity
29 import hedge.timestep.base



def exact_div(dividend, divisor):
    """Divide C{dividend} by C{divisor}, asserting that no remainder is left."""
    quot, rem = divmod(dividend, divisor)
    assert rem == 0
    return quot


def int_ceiling(value, multiple_of=1):
    """Round C{value} up to be a C{multiple_of} something."""

    from math import ceil
    return int(ceil(value/multiple_of))*multiple_of


def int_floor(value, multiple_of=1):
    """Round C{value} down to be a C{multiple_of} something."""

    from math import floor
    return int(floor(value/multiple_of))*multiple_of


def pad(s, block_size):
    """Pad the byte string C{s} with zero bytes to exactly C{block_size} bytes."""
    missing_bytes = block_size - len(s)
    assert missing_bytes >= 0
    return s + "\x00"*missing_bytes


def pad_and_join(blocks, block_size):
    """Pad each byte string in C{blocks} to C{block_size} bytes and concatenate them."""
    return "".join(pad(b, block_size) for b in blocks)
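
# A minimal usage sketch: pad_and_join gives every block a fixed stride, e.g.
#
#   chunk = pad_and_join(["ab", "cdef", "g"], 8)
#   assert len(chunk) == 3*8
#   assert chunk[8:12] == "cdef"
#
# which is what lets make_blocks and make_superblocks below hand the GPU its
# data at fixed, aligned per-block offsets.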


def make_blocks(devdata, data):
    """Pack the item sequences in C{data} into equal-size, aligned blocks
    in device memory, one block per sequence.
    """
    from hedge.backends.cuda.tools import pad_and_join

    blocks = ["".join(b) for b in data]
    block_size = devdata.align(max(len(b) for b in blocks))

    class BlockedDataStructure(Record): pass

    return BlockedDataStructure(
            blocks=cuda.to_device(pad_and_join(blocks, block_size)),
            max_per_block=max(len(b) for b in data),
            block_size=block_size,
            )
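
# A minimal sketch of how make_blocks is meant to be called; `devdata` is
# assumed to be a pycuda.tools.DeviceData instance, and the block contents
# are purely illustrative:
#
#   from pycuda.tools import DeviceData
#   devdata = DeviceData()
#   blocked = make_blocks(devdata, [["\x01\x02", "\x03"], ["\x04\x05\x06"]])
#   # block i starts at byte i*blocked.block_size of the device allocation
#   # blocked.blocks; no block holds more than blocked.max_per_block items.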


def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    """Pack several per-block data streams into one array of equal-size
    "superblocks" in device memory and generate a C struct describing the
    layout of one superblock.
    """
    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ block11, block12, ... ], [ block21, block22, ... ], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from codepy.cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # all single-item blocks must have the same length
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record): pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
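
# A rough usage sketch (the argument values and struct name are illustrative,
# not taken from the original call sites): each superblock packs one block
# from every single_item part followed by one padded block from every
# multi_item part, and the generated cgen Struct describes exactly that
# per-superblock layout.
#
#   from codepy.cgen import POD
#   sb = make_superblocks(devdata, "block_header",
#           single_item=[(header_blocks, POD(numpy.uint32, "el_count"))],
#           multi_item=[(index_lists, POD(numpy.uint16, "index"))])
#   kernel_text = str(sb.struct) + "\n" + rest_of_kernel
#   # superblock i then starts at byte i*sb.block_bytes of sb.device_memory.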



def get_load_code(dest, base, bytes, word_type=numpy.uint32,
        descr=None):
    """Generate code that copies C{bytes} bytes from C{base} to C{dest},
    cooperatively across the threads of a block, one C{word_type}-sized
    word at a time.
    """
    from codepy.cgen import \
            Pointer, POD, Value, \
            Comment, Block, Line, \
            Constant, For, Statement

    from codepy.cgen import dtype_to_ctype
    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                # use the size of the actual copy word type, not of 'int'
                "word_nr*sizeof(%s) < (%s)" % (copy_dtype_str, bytes),
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code
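
# Roughly, the generated code expands to the following CUDA C (THREAD_NUM and
# COALESCING_THREAD_COUNT are macros assumed to be supplied by the enclosing
# kernel template; src_ptr, smem_dest and byte_count stand in for the base,
# dest and bytes arguments):
#
#   {
#     unsigned int const *load_base = (unsigned int *) (src_ptr);
#     for (unsigned word_nr = THREAD_NUM;
#         word_nr*sizeof(unsigned int) < (byte_count);
#         word_nr += COALESCING_THREAD_COUNT)
#       ((unsigned int *) (smem_dest))[word_nr] = load_base[word_nr];
#   }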



def unroll(body_gen, total_number, max_unroll=None, start=0):
    """Return a list of cgen statements that evaluate C{body_gen(i)} for
    indices C{start} through C{total_number}-1, emitting an explicitly
    unrolled loop of C{max_unroll} bodies where possible.
    """
    from codepy.cgen import For, Line, Block
    from pytools import flatten

    if max_unroll is None:
        max_unroll = total_number

    result = []

    if total_number > max_unroll:
        loop_items = (total_number // max_unroll) * max_unroll

        result.extend([
                For("unsigned j = 0",
                    "j < %d" % loop_items,
                    "j += %d" % max_unroll,
                    Block(list(flatten(
                        body_gen("(j+%d)" % i)
                        for i in range(max_unroll))))
                    ),
                Line()
                ])
        start += loop_items

    result.extend(flatten(
        body_gen(i) for i in range(start, total_number)))

    return result
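
# For example, with Statement from codepy.cgen,
#
#   unroll(lambda i: [Statement("d[%s] += s[%s]" % (i, i))], 7, max_unroll=4)
#
# produces code along these lines:
#
#   for (unsigned j = 0; j < 4; j += 4)
#   {
#     d[(j+0)] += s[(j+0)];
#     d[(j+1)] += s[(j+1)];
#     d[(j+2)] += s[(j+2)];
#     d[(j+3)] += s[(j+3)];
#   }
#   d[4] += s[4];
#   d[5] += s[5];
#   d[6] += s[6];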



class RK4TimeStepper(hedge.timestep.base.TimeStepper):
    """Low-storage fourth-order Runge-Kutta time stepper whose vector algebra
    runs on the GPU and can be instrumented for timing and flop counting.
    """

    def __init__(self):
        # assumed coefficient source: the (a, b, c) triples of hedge's
        # low-storage RK4 scheme
        from hedge.timestep.rk4 import _RK4A, _RK4B, _RK4C
        self.coeffs = zip(_RK4A, _RK4B, _RK4C)

        self.instrumented = False

    def add_instrumentation(self, logmgr):
        self.timer = CallableCollectionTimer(
                "t_rk4", "Time spent doing algebra in RK4")

        from pytools.log import EventCounter
        self.flop_counter = EventCounter(
                "n_flops_rk4",
                "Number of floating point operations in RK4")

        logmgr.add_quantity(self.timer)
        logmgr.add_quantity(self.flop_counter)

        self.instrumented = True

    def __call__(self, y, t, dt, rhs):
        try:
            self.residual
        except AttributeError:
            self.residual = 0*rhs(t, y)

        if self.instrumented:
            def add_timer(n_flops, t_func):
                self.timer.add_timer_callable(t_func)
                self.flop_counter.add(n_flops)
        else:
            add_timer = None

        from hedge.tools import mul_add

        for a, b, c in self.coeffs:
            this_rhs = rhs(t + c*dt, y)

            self.residual = mul_add(a, self.residual, dt, this_rhs,
                    add_timer=add_timer)
            del this_rhs
            y = mul_add(1, y, b, self.residual,
                    add_timer=add_timer)

        return y
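
    # A minimal driving sketch, assuming `fields` lives on the GPU and
    # `rhs(t, fields)` returns the matching time derivative (`nsteps`, `dt`
    # and `logmgr` are illustrative):
    #
    #   stepper = RK4TimeStepper()
    #   stepper.add_instrumentation(logmgr)   # optional
    #   for n in range(nsteps):
    #       fields = stepper(fields, n*dt, dt, rhs)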



class CallableCollectionTimer(LogQuantity):
    """Records the elapsed time returned by a number of callables added
    via L{add_timer_callable}."""

    def __init__(self, name="interval", description=None):
        LogQuantity.__init__(self, name, "s", description)

        self.callables = []

    def add_timer_callable(self, clbl):
        self.callables.append(clbl)

    def __call__(self):
        result = sum(clbl() for clbl in self.callables)
        self.callables = []
        return result
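
# A minimal sketch of the intended use, assuming a pytools.log.LogManager
# `logmgr` and a pair of pycuda.driver.Event objects bracketing an
# asynchronous kernel launch:
#
#   timer = CallableCollectionTimer("t_kernel", "time spent in my kernel")
#   logmgr.add_quantity(timer)
#   ...
#   timer.add_timer_callable(lambda: stop_evt.time_since(start_evt)*1e-3)
#   # when the log manager samples timer(), the collected timings are summed
#   # (in seconds) and the list is cleared.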



def mpi_get_default_device(comm, dev_filter=lambda dev: True):
    """Pick a CUDA device for each rank of the MPI communicator C{comm}:
    the devices on each host that pass C{dev_filter} are handed out among
    the ranks running on that host, and the chosen device is returned on
    every rank.
    """
    class CudaDevInfo(Record): pass

    from socket import gethostname
    cuda_devices = [cuda.Device(i) for i in range(cuda.Device.count())]
    cuda_devprops = [CudaDevInfo(
        index=i,
        name=d.name(),
        memory=d.total_memory(),
        multiprocessors=d.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT))
        for i, d in enumerate(cuda_devices)
        if dev_filter(d)]

    from boostmpi import gather, scatter
    all_devprops = gather(comm, (gethostname(), cuda_devprops), 0)

    if comm.rank == 0:
        host_to_devs = {}
        rank_to_host = {}
        for rank, (hostname, cuda_devprops) in enumerate(all_devprops):
            for dev in cuda_devprops:
                if hostname in host_to_devs:
                    assert cuda_devprops == host_to_devs[hostname]
                else:
                    host_to_devs[hostname] = cuda_devprops

            rank_to_host[rank] = hostname

        def grab_device(rank):
            devs = host_to_devs[rank_to_host[rank]]
            if not devs:
                raise RuntimeError("No available CUDA device for rank %d (%s)"
                        % (rank, rank_to_host[rank]))
            else:
                return devs.pop(0)

        rank_to_device = [grab_device(rank) for rank in range(len(all_devprops))]
        for rank, dev_info in enumerate(rank_to_device):
            print "rank %d (%s) is using %s (idx: %d)" % (
                    rank, rank_to_host[rank], dev_info.name, dev_info.index)

    else:
        rank_to_device = None

    return cuda.Device(scatter(comm, rank_to_device, 0).index)
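
# A minimal sketch of the intended use in an MPI-parallel driver (the filter
# and variable names are illustrative):
#
#   import boostmpi as mpi
#   dev = mpi_get_default_device(mpi.world,
#           dev_filter=lambda d: d.total_memory() >= 512*1024**2)
#   ctx = dev.make_context()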