Package hedge :: Package backends :: Package cuda :: Module execute
[hide private]
[frames] | [no frames]

Source Code for Module hedge.backends.cuda.execute

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24  import numpy 
 25  import numpy.linalg as la 
 26  from pytools import memoize_method 
 27  import hedge.optemplate 
 28  from hedge.compiler import OperatorCompilerBase, \ 
 29          Assign, FluxBatchAssign 
 30  import pycuda.driver as cuda 
 31  import pymbolic.mapper.stringifier 
 32  from hedge.backends.exec_common import ExecutionMapperBase 
# debug stuff -----------------------------------------------------------------
def get_vec_structure(vec, point_size, segment_size, block_size,
        other_char=lambda snippet: "."):
    """Return a structured view of a vector--one character per `point_size`
    floats, `segment_size` characters partitioned off by spaces, and one
    line per `block_size` entries.

    The character emitted for each snippet is 'N' if any NaNs are
    encountered, '0' if the entire snippet is zero, or otherwise whatever
    `other_char` returns, defaulting to a period.

    :param vec: a (numpy-compatible) one-dimensional array of floats.
    :param point_size: number of consecutive entries summarized by one char.
    :param segment_size: number of entries per space-separated group.
    :param block_size: number of entries per output line.
    :param other_char: callable mapping a snippet to its display character.
    """

    result = ""
    for block in range(len(vec) // block_size):
        struc = ""
        for segment in range(block_size//segment_size):
            for point in range(segment_size//point_size):
                offset = block*block_size + segment*segment_size + point*point_size
                snippet = vec[offset:offset+point_size]

                if numpy.isnan(snippet).any():
                    struc += "N"
                elif (snippet == 0).all():
                    # all(), not any(): the docstring promises '0' only when
                    # the *entire* snippet is zero; any() would mislabel
                    # partially-zero data as all-zero.
                    struc += "0"
                else:
                    struc += other_char(snippet)

            struc += " "
        result += struc + "\n"
    return result
67 134
# exec mapper -----------------------------------------------------------------
class ExecutionMapper(ExecutionMapperBase):
    """Executes the instruction stream produced by the compiler on the CUDA
    device.

    Each ``exec_*`` method handles one instruction type and returns a pair
    ``(assignments, second_list)`` where *assignments* is a list of
    ``(name, value)`` bindings.  NOTE(review): the second list is always
    empty in every method here -- confirm its meaning against the base
    scheduler in hedge.compiler.
    """

    def exec_assign(self, insn):
        # Plain assignment: evaluate the expression and bind it to insn.name.
        return [(insn.name, self(insn.expr))], []

    def exec_vector_expr_assign(self, insn):
        """Evaluate a batch of compiled (elementwise) vector expressions."""
        if self.executor.discr.instrumented:
            # Callback used by the compiled expression to record timing,
            # flop and global-memory-traffic statistics.
            def stats_callback(n, vec_expr, t_func):
                self.executor.discr.vector_math_timer.add_timer_callable(t_func)
                self.executor.discr.vector_math_flop_counter.add(
                        n*insn.flop_count())
                self.executor.discr.gmem_bytes_vector_math.add(
                        self.executor.discr.given.float_size() * n *
                        (len(vec_expr.vector_deps)+len(insn.exprs)))
        else:
            stats_callback = None

        if insn.flop_count() == 0:
            # No arithmetic involved--just evaluate (e.g. plain renames)
            # without going through the compiled kernel.
            return [(name, self(expr))
                    for name, expr in zip(insn.names, insn.exprs)], []
        else:
            compiled = insn.compiled(self.executor)
            return zip(compiled.result_names(),
                    compiled(self, stats_callback)), []

    def exec_diff_batch_assign(self, insn):
        """Apply the differentiation kernel to insn.field, yielding one
        derivative vector per xyz axis; bind the requested axes to names."""
        field = self.rec(insn.field)

        discr = self.executor.discr
        if discr.instrumented:
            discr.diff_counter.add(discr.dimensions)
            discr.diff_flop_counter.add(discr.dimensions*(
                self.executor.diff_rst_flops + self.executor.diff_rescale_one_flops))

        xyz_diff = self.executor.diff_kernel(insn.op_class, field)

        # Debug path: compare the GPU x-derivative against a CPU reference
        # discretization; requires both debug flags to be set.
        if set(["cuda_diff", "cuda_compare"]) <= discr.debug:
            field = self.rec(insn.field)
            f = discr.volume_from_gpu(field)
            assert not numpy.isnan(f).any(), "Initial field contained NaNs."
            cpu_xyz_diff = [discr.volume_from_gpu(xd) for xd in xyz_diff]
            dx = cpu_xyz_diff[0]

            test_discr = discr.test_discr
            real_dx = test_discr.nabla[0].apply(f.astype(numpy.float64))

            diff = dx - real_dx

            for i, xd in enumerate(cpu_xyz_diff):
                if numpy.isnan(xd).any():
                    self.print_error_structure(xd, xd, xd-xd,
                            eventful_only=False, detail=False)
                    assert False, "Resulting field %d contained NaNs." % i

            from hedge.tools import relative_error
            rel_err_norm = relative_error(la.norm(diff), la.norm(real_dx))
            print "diff", rel_err_norm
            # 5e-5 is the (empirical) tolerance for GPU-vs-CPU agreement.
            if not (rel_err_norm < 5e-5):
                self.print_error_structure(dx, real_dx, diff,
                        eventful_only=False, detail=False)

            assert rel_err_norm < 5e-5

        return [(name, xyz_diff[op.xyz_axis])
                for name, op in zip(insn.names, insn.operators)], []

    def exec_flux_batch_assign(self, insn):
        """Gather fluxes on element faces for each flux in the batch, then
        apply the lift/multi-face-mass kernel to return them to volume
        DOFs; bind one result vector per flux name."""
        discr = self.executor.discr

        # One fluxes-on-faces vector per flux in the batch.
        kernel = insn.kernel(self.executor)
        all_fofs = kernel(self.rec, discr.fluxlocal_plan)
        # NOTE(review): the unpacking below assumes exactly one element
        # group -- confirm this invariant for the CUDA backend.
        elgroup, = discr.element_groups

        result = [
                (name, self.executor.fluxlocal_kernel(
                    fluxes_on_faces,
                    *self.executor.flux_local_data(
                        self.executor.fluxlocal_kernel, elgroup, wdflux.is_lift)))
                for name, wdflux, fluxes_on_faces in zip(
                    insn.names, insn.fluxes, all_fofs)]

        if discr.instrumented:
            given = discr.given

            flux_count = len(insn.fluxes)
            dep_count = len(kernel.interior_deps)

            discr.gather_counter.add(
                    flux_count*dep_count)
            discr.gather_flop_counter.add(
                    flux_count
                    * given.dofs_per_face()
                    * given.faces_per_el()
                    * len(discr.mesh.elements)
                    * (1 # facejac-mul
                        + 2 * # int+ext
                        3*dep_count # const-mul, normal-mul, add
                        )
                    )

            discr.lift_counter.add(flux_count)
            discr.lift_flop_counter.add(flux_count*self.executor.lift_flops)

            # debug ---------------------------------------------------------
            # NOTE(review): this block reads `given`, which is only bound
            # above when discr.instrumented is set, so it is placed inside
            # the instrumented branch -- confirm against upstream source.
            if discr.debug & set(["cuda_lift", "cuda_flux"]):
                fplan = discr.flux_plan

                for fluxes_on_faces in all_fofs:
                    # Only this prefix of the vector carries real data; the
                    # remainder is alignment padding.
                    useful_size = (len(discr.blocks)
                            * given.aligned_face_dofs_per_microblock()
                            * fplan.microblocks_per_block())
                    fof = fluxes_on_faces.get()

                    fof = fof[:useful_size]

                    # Scan each block's *used* portion for NaNs.
                    have_used_nans = False
                    for i_b, block in enumerate(discr.blocks):
                        offset = i_b*(given.aligned_face_dofs_per_microblock()
                                *fplan.microblocks_per_block())
                        size = (len(block.el_number_map)
                                *given.dofs_per_face()
                                *given.faces_per_el())
                        if numpy.isnan(la.norm(fof[offset:offset+size])).any():
                            have_used_nans = True

                    if have_used_nans:
                        struc = ( given.dofs_per_face(),
                                given.dofs_per_face()*given.faces_per_el(),
                                given.aligned_face_dofs_per_microblock(),
                                )

                        # NOTE(review): get_vec_structure is a module-level
                        # helper; calling it via self relies on attribute
                        # lookup finding it -- verify.
                        print self.get_vec_structure(fof, *struc)
                        raise RuntimeError("Detected used NaNs in flux gather output.")

                    # Redundant with the raise above; kept as a safety net.
                    assert not have_used_nans

            if "cuda_lift" in discr.debug:
                # Force all pending kernels to finish before reading back.
                cuda.Context.synchronize()
                print "NANCHECK"

                for name in insn.names:
                    flux = self.context[name]
                    copied_flux = discr.convert_volume(flux, kind="numpy")
                    contains_nans = numpy.isnan(copied_flux).any()
                    if contains_nans:
                        print "examining", name
                        # NOTE(review): presumably a module-level helper
                        # defined in the elided part of this file -- verify.
                        print_error_structure(discr,
                                copied_flux, copied_flux, copied_flux-copied_flux,
                                eventful_only=True)
                    assert not contains_nans, "Resulting flux contains NaNs."

        return result, []

    def exec_mass_assign(self, insn):
        """Apply an element-local (mass-type) operator to insn.field."""
        # NOTE(review): single-element-group assumption, as above.
        elgroup, = self.executor.discr.element_groups
        kernel = self.executor.discr.element_local_kernel()
        return [(insn.name, kernel(
            self.rec(insn.field),
            *self.executor.mass_data(kernel, elgroup, insn.op_class)))], []
# compiler stuff --------------------------------------------------------------
class VectorExprAssign(Assign):
    """An assignment instruction whose right-hand sides are evaluated as one
    compiled (CUDA) vector expression."""

    __slots__ = ["compiled"]

    comment = "compiled"

    def get_executor_method(self, executor):
        return executor.exec_vector_expr_assign

    @memoize_method
    def compiled(self, executor):
        """Build (once) the compiled vector expression for this assignment."""
        from hedge.backends.vector_expr import \
                VectorExpressionInfo, simple_result_dtype_getter
        from hedge.backends.cuda.vector_expr import CompiledVectorExpression

        expr_infos = []
        for out_name, out_expr, skip_return in zip(
                self.names, self.exprs, self.do_not_return):
            expr_infos.append(VectorExpressionInfo(
                name=out_name,
                expr=out_expr,
                do_not_return=skip_return))

        return CompiledVectorExpression(
                expr_infos,
                result_dtype_getter=simple_result_dtype_getter,
                allocator=executor.discr.pool.allocate)
class CUDAFluxBatchAssign(FluxBatchAssign):
    """A flux-batch assignment specialized for the CUDA backend."""

    @memoize_method
    def get_dependencies(self):
        """Collect the dependencies of every flux in the batch, mapped
        through the instruction's dependency mapper."""
        raw_deps = set()
        for whole_domain_flux in self.fluxes:
            raw_deps.update(whole_domain_flux.interior_deps)
            raw_deps.update(whole_domain_flux.boundary_deps)

        dep_mapper = self.dep_mapper_factory()

        mapped = set()
        for dep in raw_deps:
            mapped.update(dep_mapper(dep))
        return mapped

    @memoize_method
    def kernel(self, executor):
        """Build (once) the gather kernel for this batch of fluxes."""
        discr = executor.discr
        return discr.flux_plan.make_kernel(discr, executor, self.fluxes)
class OperatorCompiler(OperatorCompilerBase):
    # Class-level alias: tells the base compiler to use the CUDA backend's
    # BoundOperatorCollector when scanning the operator template.
    from hedge.backends.cuda.optemplate import \
            BoundOperatorCollector \
            as bound_op_collector_class

353 - def get_contained_fluxes(self, expr):
354 from hedge.backends.cuda.optemplate import FluxCollector 355 return [self.FluxRecord( 356 flux_expr=wdflux, 357 dependencies=set(wdflux.interior_deps) | set(wdflux.boundary_deps), 358 kind="whole-domain") 359 for wdflux in FluxCollector()(expr)]
360
361 - def internal_map_flux(self, wdflux):
362 from hedge.backends.cuda.optemplate import WholeDomainFluxOperator 363 return WholeDomainFluxOperator( 364 wdflux.is_lift, 365 [wdflux.InteriorInfo( 366 flux_expr=ii.flux_expr, 367 field_expr=self.rec(ii.field_expr)) 368 for ii in wdflux.interiors], 369 [wdflux.BoundaryInfo( 370 flux_expr=bi.flux_expr, 371 bpair=self.rec(bi.bpair)) 372 for bi in wdflux.boundaries], 373 wdflux.flux_optemplate)
374
    def map_whole_domain_flux(self, wdflux):
        # Whole-domain fluxes are scheduled like any other planned flux.
        return self.map_planned_flux(wdflux)
377
378 - def make_flux_batch_assign(self, names, fluxes, kind):
381
382 - def finalize_multi_assign(self, names, exprs, do_not_return, priority):
387
class Executor(object):
    """Compiles an operator template for the CUDA backend and executes it
    (see __call__)."""

    # Mapper class instantiated per execution to run the compiled code.
    exec_mapper_class = ExecutionMapper

    def __init__(self, discr, optemplate, post_bind_mapper):
        """Compile *optemplate* for execution on *discr*.

        :param discr: the CUDA discretization to execute on.
        :param optemplate: the operator template to compile.
        :param post_bind_mapper: mapper applied after operator binding
            (see prepare_optemplate_stage1).
        """
        self.discr = discr

        # Pre-compute flop counts used by the instrumentation counters.
        from hedge.tools import diff_rst_flops, diff_rescale_one_flops, \
                mass_flops, lift_flops
        self.diff_rst_flops = diff_rst_flops(discr)
        self.diff_rescale_one_flops = diff_rescale_one_flops(discr)
        self.mass_flops = mass_flops(discr)
        self.lift_flops = sum(lift_flops(fg) for fg in discr.face_groups)

        optemplate_stage1 = self.prepare_optemplate_stage1(
                optemplate, post_bind_mapper)

        # build a boundary tag bitmap: assign each boundary tag a sequential
        # number, first-come first-served ...
        from hedge.optemplate import BoundaryTagCollector
        self.boundary_tag_to_number = {}
        for btag in BoundaryTagCollector()(optemplate_stage1):
            self.boundary_tag_to_number.setdefault(btag,
                    len(self.boundary_tag_to_number))

        # ... then map each (element, face) pair to the OR of the bits of
        # all boundary tags it belongs to.
        e2bb = self.elface_to_bdry_bitmap = {}

        for btag, bdry_number in self.boundary_tag_to_number.iteritems():
            bdry_bit = 1 << bdry_number
            for elface in discr.mesh.tag_to_boundary.get(btag, []):
                e2bb[elface] = (e2bb.get(elface, 0) | bdry_bit)

        # compile the optemplate
        # NOTE(review): 220 // sizeof(pointer) caps the number of vectors per
        # batched expression -- presumably a kernel-argument-size limit;
        # confirm against the vector-expression kernel.
        from struct import calcsize
        self.code = OperatorCompiler(
                max_vectors_in_batch_expr=220 // calcsize("P")
                )(
                self.prepare_optemplate_stage2(discr.mesh, optemplate_stage1))

        # build the local kernels
        self.diff_kernel = self.discr.diff_plan.make_kernel(discr)
        self.fluxlocal_kernel = self.discr.fluxlocal_plan.make_kernel(discr,
                with_index_check=False)
432 433 @staticmethod
434 - def prepare_optemplate_stage2(mesh, optemplate):
444 445 @staticmethod
446 - def prepare_optemplate_stage1(optemplate, post_bind_mapper=lambda x: x):
447 from hedge.optemplate import OperatorBinder 448 return post_bind_mapper(OperatorBinder()(optemplate))
449 450 @classmethod
451 - def prepare_optemplate(cls, mesh, optemplate, post_bind_mapper=lambda x: x):
452 return cls.prepare_optemplate_stage2(mesh, 453 cls.prepare_optemplate_stage1(optemplate, post_bind_mapper))
454 455 @classmethod
456 - def get_first_flux_batch(cls, mesh, optemplate):
457 compiler = OperatorCompiler() 458 compiler(cls.prepare_optemplate(mesh, optemplate)) 459 460 if compiler.flux_batches: 461 return compiler.flux_batches[0] 462 else: 463 return None
464
    def instrument(self):
        # No-op hook: this backend records its statistics inline during
        # execution (see ExecutionMapper), so nothing to set up here.
        pass
467 468 # actual execution --------------------------------------------------------
469 - def __call__(self, **vars):
470 return self.code.execute( 471 self.discr.exec_mapper_class(vars, self))
472 473 # data caches for execution ----------------------------------------------- 474 @memoize_method
475 - def flux_local_data(self, kernel, elgroup, is_lift):
476 if is_lift: 477 mat = elgroup.local_discretization.lifting_matrix() 478 prep_scaling = kernel.prepare_scaling(elgroup, elgroup.inverse_jacobians) 479 else: 480 mat = elgroup.local_discretization.multi_face_mass_matrix() 481 prep_scaling = None 482 483 prep_mat = kernel.prepare_matrix(mat) 484 485 return prep_mat, prep_scaling
486 487 @memoize_method
488 - def mass_data(self, kernel, elgroup, op_class):
489 return (kernel.prepare_matrix(op_class.matrix(elgroup)), 490 kernel.prepare_scaling(elgroup, op_class.coefficients(elgroup)))
491