1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 import numpy.linalg as la
26 from pytools import memoize_method
27 import hedge.optemplate
28 from hedge.compiler import OperatorCompilerBase, \
29 Assign, FluxBatchAssign
30 import pycuda.driver as cuda
31 import pymbolic.mapper.stringifier
32 from hedge.backends.exec_common import ExecutionMapperBase
33
34
35
36
37
38 -def get_vec_structure(vec, point_size, segment_size, block_size,
39 other_char=lambda snippet: "."):
40 """Prints a structured view of a vector--one character per `point_size` floats,
41 `segment_size` characters partitioned off by spaces, `block_size` segments
42 per line.
43
44 The caracter printed is either an 'N' if any NaNs are encountered, a zero
45 if the entire snippet is zero, or otherwise whatever `other_char` returns,
46 defaulting to a period.
47 """
48
49 result = ""
50 for block in range(len(vec) // block_size):
51 struc = ""
52 for segment in range(block_size//segment_size):
53 for point in range(segment_size//point_size):
54 offset = block*block_size + segment*segment_size + point*point_size
55 snippet = vec[offset:offset+point_size]
56
57 if numpy.isnan(snippet).any():
58 struc += "N"
59 elif (snippet == 0).any():
60 struc += "0"
61 else:
62 struc += other_char(snippet)
63
64 struc += " "
65 result += struc + "\n"
66 return result
67
68
69
70
71 -def print_error_structure(discr, computed, reference, diff,
72 eventful_only=False, detail=True):
73 norm_ref = la.norm(reference)
74 struc_lines = []
75
76 if norm_ref == 0:
77 norm_ref = 1
78
79 from hedge.tools import relative_error
80 numpy.set_printoptions(precision=2, linewidth=130, suppress=True)
81 for block in discr.blocks:
82 add_lines = []
83 struc_line = "%7d " % (block.number * discr.flux_plan.dofs_per_block())
84 i_el = 0
85 eventful = False
86 for mb in block.microblocks:
87 for el in mb:
88 s = discr.find_el_range(el.id)
89 relerr = relative_error(la.norm(diff[s]), norm_ref)
90 if relerr > 1e-4:
91 eventful = True
92 struc_line += "*"
93 if detail:
94 print "block %d, el %d, global el #%d, rel.l2err=%g" % (
95 block.number, i_el, el.id, relerr)
96 print computed[s]
97 print reference[s]
98 print diff[s]
99 print diff[s]/norm_ref
100 print la.norm(diff[s]), norm_ref
101 raw_input()
102 elif numpy.isnan(diff[s]).any():
103 eventful = True
104 struc_line += "N"
105 add_lines.append(str(diff[s]))
106
107 if detail:
108 print "block %d, el %d, global el #%d, rel.l2err=%g" % (
109 block.number, i_el, el.id, relerr)
110 print computed[s]
111 print reference[s]
112 print diff[s]
113 raw_input()
114 else:
115 if numpy.max(numpy.abs(reference[s])) == 0:
116 struc_line += "0"
117 else:
118 if False:
119 print "block %d, el %d, global el #%d, rel.l2err=%g" % (
120 block.number, i_el, el.id, relerr)
121 print computed[s]
122 print reference[s]
123 print diff[s]
124 raw_input()
125 struc_line += "."
126 i_el += 1
127 struc_line += " "
128 if (not eventful_only) or eventful:
129 struc_lines.append(struc_line)
130 if detail:
131 struc_lines.extend(add_lines)
132 print
133 print "\n".join(struc_lines)
134
141 return [(insn.name, self(insn.expr))], []
142
144 if self.executor.discr.instrumented:
145 def stats_callback(n, vec_expr, t_func):
146 self.executor.discr.vector_math_timer.add_timer_callable(t_func)
147 self.executor.discr.vector_math_flop_counter.add(
148 n*insn.flop_count())
149 self.executor.discr.gmem_bytes_vector_math.add(
150 self.executor.discr.given.float_size() * n *
151 (len(vec_expr.vector_deps)+len(insn.exprs)))
152 else:
153 stats_callback = None
154
155 if insn.flop_count() == 0:
156 return [(name, self(expr))
157 for name, expr in zip(insn.names, insn.exprs)], []
158 else:
159 compiled = insn.compiled(self.executor)
160 return zip(compiled.result_names(),
161 compiled(self, stats_callback)), []
162
164 field = self.rec(insn.field)
165
166 discr = self.executor.discr
167 if discr.instrumented:
168 discr.diff_counter.add(discr.dimensions)
169 discr.diff_flop_counter.add(discr.dimensions*(
170 self.executor.diff_rst_flops + self.executor.diff_rescale_one_flops))
171
172 xyz_diff = self.executor.diff_kernel(insn.op_class, field)
173
174 if set(["cuda_diff", "cuda_compare"]) <= discr.debug:
175 field = self.rec(insn.field)
176 f = discr.volume_from_gpu(field)
177 assert not numpy.isnan(f).any(), "Initial field contained NaNs."
178 cpu_xyz_diff = [discr.volume_from_gpu(xd) for xd in xyz_diff]
179 dx = cpu_xyz_diff[0]
180
181 test_discr = discr.test_discr
182 real_dx = test_discr.nabla[0].apply(f.astype(numpy.float64))
183
184 diff = dx - real_dx
185
186 for i, xd in enumerate(cpu_xyz_diff):
187 if numpy.isnan(xd).any():
188 self.print_error_structure(xd, xd, xd-xd,
189 eventful_only=False, detail=False)
190 assert False, "Resulting field %d contained NaNs." % i
191
192 from hedge.tools import relative_error
193 rel_err_norm = relative_error(la.norm(diff), la.norm(real_dx))
194 print "diff", rel_err_norm
195 if not (rel_err_norm < 5e-5):
196 self.print_error_structure(dx, real_dx, diff,
197 eventful_only=False, detail=False)
198
199 assert rel_err_norm < 5e-5
200
201 return [(name, xyz_diff[op.xyz_axis])
202 for name, op in zip(insn.names, insn.operators)], []
203
204
206 discr = self.executor.discr
207
208 kernel = insn.kernel(self.executor)
209 all_fofs = kernel(self.rec, discr.fluxlocal_plan)
210 elgroup, = discr.element_groups
211
212 result = [
213 (name, self.executor.fluxlocal_kernel(
214 fluxes_on_faces,
215 *self.executor.flux_local_data(
216 self.executor.fluxlocal_kernel, elgroup, wdflux.is_lift)))
217 for name, wdflux, fluxes_on_faces in zip(
218 insn.names, insn.fluxes, all_fofs)]
219
220 if discr.instrumented:
221 given = discr.given
222
223 flux_count = len(insn.fluxes)
224 dep_count = len(kernel.interior_deps)
225
226 discr.gather_counter.add(
227 flux_count*dep_count)
228 discr.gather_flop_counter.add(
229 flux_count
230 * given.dofs_per_face()
231 * given.faces_per_el()
232 * len(discr.mesh.elements)
233 * (1
234 + 2 *
235 3*dep_count
236 )
237 )
238
239 discr.lift_counter.add(flux_count)
240 discr.lift_flop_counter.add(flux_count*self.executor.lift_flops)
241
242
243 if discr.debug & set(["cuda_lift", "cuda_flux"]):
244 fplan = discr.flux_plan
245
246 for fluxes_on_faces in all_fofs:
247 useful_size = (len(discr.blocks)
248 * given.aligned_face_dofs_per_microblock()
249 * fplan.microblocks_per_block())
250 fof = fluxes_on_faces.get()
251
252 fof = fof[:useful_size]
253
254 have_used_nans = False
255 for i_b, block in enumerate(discr.blocks):
256 offset = i_b*(given.aligned_face_dofs_per_microblock()
257 *fplan.microblocks_per_block())
258 size = (len(block.el_number_map)
259 *given.dofs_per_face()
260 *given.faces_per_el())
261 if numpy.isnan(la.norm(fof[offset:offset+size])).any():
262 have_used_nans = True
263
264 if have_used_nans:
265 struc = ( given.dofs_per_face(),
266 given.dofs_per_face()*given.faces_per_el(),
267 given.aligned_face_dofs_per_microblock(),
268 )
269
270 print self.get_vec_structure(fof, *struc)
271 raise RuntimeError("Detected used NaNs in flux gather output.")
272
273 assert not have_used_nans
274
275 if "cuda_lift" in discr.debug:
276 cuda.Context.synchronize()
277 print "NANCHECK"
278
279 for name in insn.names:
280 flux = self.context[name]
281 copied_flux = discr.convert_volume(flux, kind="numpy")
282 contains_nans = numpy.isnan(copied_flux).any()
283 if contains_nans:
284 print "examining", name
285 print_error_structure(discr,
286 copied_flux, copied_flux, copied_flux-copied_flux,
287 eventful_only=True)
288 assert not contains_nans, "Resulting flux contains NaNs."
289
290 return result, []
291
298
304 __slots__ = ["compiled"]
305
308
309 comment = "compiled"
310
311 @memoize_method
327
329 @memoize_method
331 deps = set()
332 for wdflux in self.fluxes:
333 deps |= set(wdflux.interior_deps)
334 deps |= set(wdflux.boundary_deps)
335
336 dep_mapper = self.dep_mapper_factory()
337
338 from pytools import flatten
339 return set(flatten(dep_mapper(dep) for dep in deps))
340
341 @memoize_method
343 return executor.discr.flux_plan.make_kernel(
344 executor.discr, executor, self.fluxes)
345
387
392 exec_mapper_class = ExecutionMapper
393
394 - def __init__(self, discr, optemplate, post_bind_mapper):
395 self.discr = discr
396
397 from hedge.tools import diff_rst_flops, diff_rescale_one_flops, \
398 mass_flops, lift_flops
399 self.diff_rst_flops = diff_rst_flops(discr)
400 self.diff_rescale_one_flops = diff_rescale_one_flops(discr)
401 self.mass_flops = mass_flops(discr)
402 self.lift_flops = sum(lift_flops(fg) for fg in discr.face_groups)
403
404 optemplate_stage1 = self.prepare_optemplate_stage1(
405 optemplate, post_bind_mapper)
406
407
408 from hedge.optemplate import BoundaryTagCollector
409 self.boundary_tag_to_number = {}
410 for btag in BoundaryTagCollector()(optemplate_stage1):
411 self.boundary_tag_to_number.setdefault(btag,
412 len(self.boundary_tag_to_number))
413
414 e2bb = self.elface_to_bdry_bitmap = {}
415
416 for btag, bdry_number in self.boundary_tag_to_number.iteritems():
417 bdry_bit = 1 << bdry_number
418 for elface in discr.mesh.tag_to_boundary.get(btag, []):
419 e2bb[elface] = (e2bb.get(elface, 0) | bdry_bit)
420
421
422 from struct import calcsize
423 self.code = OperatorCompiler(
424 max_vectors_in_batch_expr=220 // calcsize("P")
425 )(
426 self.prepare_optemplate_stage2(discr.mesh, optemplate_stage1))
427
428
429 self.diff_kernel = self.discr.diff_plan.make_kernel(discr)
430 self.fluxlocal_kernel = self.discr.fluxlocal_plan.make_kernel(discr,
431 with_index_check=False)
432
433 @staticmethod
444
445 @staticmethod
449
450 @classmethod
454
455 @classmethod
464
467
468
470 return self.code.execute(
471 self.discr.exec_mapper_class(vars, self))
472
473
474 @memoize_method
486
487 @memoize_method
488 - def mass_data(self, kernel, elgroup, op_class):
491