1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 import numpy.linalg as la
26 import hedge.discretization
27 import pycuda.driver as cuda
28 import pycuda.gpuarray as gpuarray
29 from pytools import memoize_method, memoize
30
31
32
33
34
36 """Describes what data is local to each thread block on the GPU.
37
38 @ivar number: The global number of this block.
39 @ivar local_discretization: The L{hedge.element.Element} instance used
40 for elements in this block.
41 @ivar cpu_slices: A list of slices describing the CPU-side
42 storage locations for the block's elements.
43 @ivar microblocks: A list of lists of L{hedge.mesh.Element} instances,
44 each representing the elements in one block, and together representing
45 one block.
46 @ivar el_offsets_list: A lsit containing the offsets of the elements in
47 this block, in order.
48 @ivar el_number_map: A dictionary mapping L{hedge.mesh.Element} instances
49 to their number within this block.
50 """
51 __slots__ = ["number", "local_discretization", "cpu_slices", "microblocks",
52 "el_offsets_list", "el_number_map", "el_number_map"
53 ]
54
55 - def __init__(self, number, local_discretization, cpu_slices, microblocks,
56 el_offsets_list, el_number_map):
57 self.number = number
58 self.local_discretization = local_discretization
59 self.cpu_slices = cpu_slices
60 self.microblocks = microblocks
61 self.el_offsets_list = el_offsets_list
62 self.el_number_map = el_number_map
63
64
65
66
67
69 """Describes where the dofs of an element face are stored.
70
71 @ivar opposite: the L{GPUFacestorage} instance for the face
72 oposite to this one.
73 """
74 __slots__ = ["opposite"]
75
78
80 if self.opposite is None:
81 self.opposite = opp
82 else:
83 assert self.opposite is opp
84
86 """Describes storage locations for a face local to an element in a block.
87
88 @ivar el_face: a tuple C{(element, face_number)}.
89 @ivar cpu_slice: the base index of the element in CPU numbering.
90 @ivar native_index_list_id:
91 @ivar opp_write_index_list_id:
92 @ivar native_block: block in which element is to be found.
93 @ivar face_pair_side:
94 """
95 __slots__ = [
96 "el_face", "cpu_slice",
97 "native_index_list_id", "opp_write_index_list_id",
98 "global_int_flux_index_list_id", "global_ext_flux_index_list_id",
99 "native_block",
100 "face_pair_side"]
101
102 - def __init__(self, el_face, cpu_slice, native_index_list_id,
103 native_block, face_pair_side):
104 GPUFaceStorage.__init__(self)
105 self.el_face = el_face
106 self.cpu_slice = cpu_slice
107 self.native_index_list_id = native_index_list_id
108 self.native_block = native_block
109 self.face_pair_side = face_pair_side
110
112 """Describes storage locations for a boundary face.
113
114 @ivar cpu_bdry_index_in_floats: this face's starting index
115 in the CPU-based TAG_ALL boundary array [floats].
116 @ivar gpu_bdry_index_in_floats: this face's starting index
117 in the GPU-based TAG_ALL boundary array [floats].
118 @ivar face_pair_side:
119 """
120 __slots__ = [
121 "cpu_bdry_index_in_floats",
122 "gpu_bdry_index_in_floats",
123 "face_pair_side",
124 ]
125
126 - def __init__(self,
127 cpu_bdry_index_in_floats,
128 gpu_bdry_index_in_floats,
129 face_pair_side
130 ):
131 GPUFaceStorage.__init__(self)
132 self.cpu_bdry_index_in_floats = cpu_bdry_index_in_floats
133 self.gpu_bdry_index_in_floats = gpu_bdry_index_in_floats
134 self.face_pair_side = face_pair_side
135
136
137
138
139 @memoize
141 mod = cuda.SourceModule("""
142 texture<float, 1, cudaReadModeElementType> field_tex;
143 __global__ void boundarize(float *bfield,
144 unsigned int *to_indices,
145 unsigned int *from_indices,
146 unsigned int n)
147 {
148 int tid = threadIdx.x;
149 int total_threads = gridDim.x*blockDim.x;
150 int cta_start = blockDim.x*blockIdx.x;
151 int i;
152
153 for (i = cta_start + tid; i < n; i += total_threads)
154 {
155 bfield[to_indices[i]] =
156 tex1Dfetch(field_tex, from_indices[i]);
157 }
158 }
159 """)
160
161 return (mod.get_function("boundarize"),
162 mod.get_texref("field_tex"))
163
164
165
166
167
188
189
190
191
193
194 from pymetis import part_graph
195 orig_part_count = part_count = (
196 len(mesh.elements)//flux_plan.elements_per_block()+1)
197 while True:
198 cuts, partition = part_graph(part_count,
199 mesh.element_adjacency_graph(),
200 vweights=[1000]*len(mesh.elements))
201
202
203 block2extifaces = dict((i, 0) for i in range(part_count))
204
205 for (e1, f1), (e2, f2) in mesh.both_interfaces():
206 b1 = partition[e1.id]
207 b2 = partition[e2.id]
208
209 if b1 != b2:
210 block2extifaces[b1] += 1
211
212 for el, face_nbr in mesh.tag_to_boundary[hedge.mesh.TAG_ALL]:
213 b1 = partition[el.id]
214 block2extifaces[b1] += 1
215
216 blocks = dict((i, []) for i in range(part_count))
217 for el_id, block in enumerate(partition):
218 blocks[block].append(el_id)
219 block_elements = max(len(block_els) for block_els in blocks.itervalues())
220
221 from hedge.cuda.plan import Parallelism
222 actual_plan = flux_plan.copy(
223 max_ext_faces=max(block2extifaces.itervalues()),
224 max_faces=max(
225 len(blocks[b])*flux_plan.faces_per_el()
226 + block2extifaces[b]
227 for b in range(len(blocks))),
228 )
229 assert actual_plan.max_faces % 2 == 0
230
231 if (block_elements <= actual_plan.elements_per_block()
232 and (flux_plan.occupancy_record().occupancy -
233 actual_plan.occupancy_record().occupancy) < 1e-10):
234 break
235
236 part_count += min(5, int(part_count*0.01))
237
238 print "blocks: theoretical:%d practical:%d" % (orig_part_count, part_count)
239
240 if False:
241 from matplotlib.pylab import hist, show
242 print plan.get_extface_count()
243 hist(block2extifaces.values())
244 show()
245 hist([len(block_els) for block_els in blocks.itervalues()])
246 show()
247
248 return actual_plan, partition
249
250
251
252
253 - def __init__(self, mesh, local_discretization=None,
254 order=None, flux_plan=None, init_cuda=True, debug=False,
255 dev=None, default_scalar_type=numpy.float32):
256 ldis = self.get_local_discretization(mesh, local_discretization, order)
257
258 if init_cuda:
259 cuda.init()
260
261 if dev is None:
262 assert cuda.Device.count() >= 1
263 dev = cuda.Device(0)
264 if isinstance(dev, int):
265 dev = cuda.Device(dev)
266 if init_cuda:
267 self.cuda_context = dev.make_context()
268
269 self.device = dev
270 from hedge.cuda.tools import DeviceData
271 self.devdata = DeviceData(dev)
272
273
274 if flux_plan is None:
275 flux_plan = self._make_plan(ldis, mesh, default_scalar_type)
276 print "projected flux exec plan:", flux_plan
277
278
279 self.flux_plan, self.partition = self._partition_mesh(mesh, flux_plan)
280 del flux_plan
281 print "actual flux exec plan:", self.flux_plan
282 print "actual local op exec plan:", self.flux_plan.diff_plan()
283 print "actual flux local exec plan:", self.flux_plan.flux_lifting_plan()
284
285
286 hedge.discretization.Discretization.__init__(self, mesh, ldis, debug=debug,
287 default_scalar_type=default_scalar_type)
288
289
290 self.blocks = self._build_blocks()
291 self.face_storage_map = self._build_face_storage_map()
292
293
294 from hedge.discr_precompiled import Discretization
295 self.test_discr = Discretization(mesh, ldis)
296
298 block_el_numbers = {}
299 for el_id, block in enumerate(self.partition):
300 block_el_numbers.setdefault(block, []).append(el_id)
301
302 block_count = len(block_el_numbers)
303
304 def make_block(block_num):
305 fplan = self.flux_plan
306
307 microblocks = []
308 current_microblock = []
309 el_offsets_list = []
310 el_number_map = {}
311 elements = [self.mesh.elements[ben] for ben in block_el_numbers[block_num]]
312 for block_el_nr, el in enumerate(elements):
313 el_offset = (
314 len(microblocks)*fplan.microblock.aligned_floats
315 + len(current_microblock)*fplan.dofs_per_el())
316 el_number_map[el] = block_el_nr
317 el_offsets_list.append(el_offset)
318
319 current_microblock.append(el)
320 if len(current_microblock) == fplan.microblock.elements:
321 microblocks.append(current_microblock)
322 current_microblock = []
323
324 if current_microblock:
325 microblocks.append(current_microblock)
326
327 assert len(microblocks) <= fplan.microblocks_per_block()
328
329 eg, = self.element_groups
330 return GPUBlock(block_num,
331 local_discretization=eg.local_discretization,
332 cpu_slices=[self.find_el_range(el.id) for el in elements],
333 microblocks=microblocks,
334 el_offsets_list=el_offsets_list,
335 el_number_map=el_number_map)
336
337 return [make_block(block_num) for block_num in range(block_count)]
338
339
340
341
343
344
345
346 fsm = {}
347
348 from hedge.tools import IndexListRegistry
349 fil_registry = IndexListRegistry(self.debug)
350
351 def make_int_face(face_pair_side):
352 el = self.mesh.elements[face_pair_side.element_id]
353 elface = (el, face_pair_side.face_id)
354
355 block = self.blocks[self.partition[el.id]]
356 iln = fil_registry.register(
357 (ldis, face_pair_side.face_id),
358 lambda: ldis.face_indices()[face_pair_side.face_id]
359 )
360 result = GPUInteriorFaceStorage(
361 elface,
362 cpu_slice=self.find_el_range(el.id),
363 native_index_list_id=iln,
364 native_block=block,
365 face_pair_side=face_pair_side
366 )
367
368 assert elface not in fsm
369 fsm[elface] = result
370 return result
371
372
373 def narrow_ilist(in_el_ilist, native_el_ilist):
374 return get_read_from_map_from_permutation
375
376 el_dof_to_face_dof = dict(
377 (el_dof, i)
378 for i, el_dof in enumerate(native_el_ilist))
379 return tuple(el_dof_to_face_dof[el_dof]
380 for el_dof in in_el_ilist)
381
382 int_fg, = self.face_groups
383 ldis = int_fg.ldis_loc
384 assert ldis == int_fg.ldis_opp
385
386 id_face_index_list_number = fil_registry.register(
387 None,
388 lambda: tuple(xrange(ldis.face_node_count()))
389 )
390 assert id_face_index_list_number == 0
391
392 from pytools import single_valued
393 for fp in int_fg.face_pairs:
394 face1 = make_int_face(fp.loc)
395 face2 = make_int_face(fp.opp)
396 face1.opposite = face2
397 face2.opposite = face1
398
399 def apply_write_map(wmap, sequence):
400 result = [None] * len(sequence)
401 for wm_i, seq_i in zip(wmap, sequence):
402 result[wm_i] = seq_i
403 assert None not in result
404 return tuple(result)
405
406 f_ind = ldis.face_indices()
407
408 def face1_in_el_ilist():
409 return tuple(int_fg.index_lists[
410 fp.loc.face_index_list_number])
411
412 def face2_in_el_ilist():
413 return tuple(int_fg.index_lists[
414 fp.opp.face_index_list_number])
415
416 def opp_write_map():
417 return tuple(
418 int_fg.index_lists[fp.opp_native_write_map])
419
420 face1.global_int_flux_index_list_id = fil_registry.register(
421 (int_fg, fp.loc.face_index_list_number),
422 face1_in_el_ilist)
423 face1.global_ext_flux_index_list_id = fil_registry.register(
424 (int_fg, fp.opp.face_index_list_number),
425 face2_in_el_ilist)
426
427 face2.global_int_flux_index_list_id = fil_registry.register(
428 (int_fg, fp.opp_native_write_map,
429 fp.opp.face_index_list_number),
430 lambda: apply_write_map(
431 opp_write_map(), face2_in_el_ilist())
432 )
433 face2.global_ext_flux_index_list_id = fil_registry.register(
434 (int_fg, fp.opp_native_write_map,
435 fp.loc.face_index_list_number),
436 lambda: apply_write_map(
437 opp_write_map(), face1_in_el_ilist())
438 )
439
440 from pytools import get_write_to_map_from_permutation as gwtm
441
442 face1.opp_write_index_list_id = fil_registry.register(
443 (int_fg, "wtm", fp.opp.face_index_list_number,
444 fp.opp.face_id),
445 lambda: gwtm(face2_in_el_ilist(), f_ind[fp.opp.face_id])
446 )
447 face2.opp_write_index_list_id = fil_registry.register(
448 (int_fg, "wtm",
449 fp.opp_native_write_map,
450 fp.loc.face_index_list_number,
451 fp.loc.face_id),
452 lambda: gwtm(
453 apply_write_map(opp_write_map(), face1_in_el_ilist()),
454 f_ind[fp.loc.face_id])
455 )
456
457 self.aligned_boundary_floats = 0
458 from hedge.mesh import TAG_ALL
459 for bdry_fg in self.get_boundary(TAG_ALL).face_groups:
460 assert ldis == bdry_fg.ldis_loc
461 aligned_fnc = self.devdata.align_dtype(ldis.face_node_count(),
462 self.flux_plan.float_size)
463 for fp in bdry_fg.face_pairs:
464 assert fp.opp.element_id == hedge._internal.INVALID_ELEMENT
465
466
467
468 face1 = make_int_face(fp.loc)
469 face2 = GPUBoundaryFaceStorage(
470 fp.opp.el_base_index,
471 self.aligned_boundary_floats,
472 fp.opp
473 )
474 self.aligned_boundary_floats += aligned_fnc
475 face1.opposite = face2
476 face2.opposite = face1
477
478 face1.global_int_flux_index_list_id = fil_registry.register(
479 (bdry_fg,fp.loc.face_index_list_number),
480 lambda: tuple(bdry_fg.index_lists[
481 fp.loc.face_index_list_number])
482 )
483 face1.global_ext_flux_index_list_id = fil_registry.register(
484 (bdry_fg, fp.opp.face_index_list_number),
485 lambda: tuple(bdry_fg.index_lists[
486 fp.opp.face_index_list_number])
487 )
488
489 self.index_lists = fil_registry.index_lists
490 return fsm
491
492
493
494
496 fplan = self.flux_plan
497 block = self.blocks[self.partition[el.id]]
498
499 mb_nr, in_mb_nr = divmod(block.el_number_map[el], fplan.microblock.elements)
500
501 return (block.number * self.flux_plan.dofs_per_block()
502 + mb_nr*fplan.microblock.aligned_floats
503 + in_mb_nr*fplan.dofs_per_el())
504
506 block = self.blocks[self.partition[el.id]]
507 return block.el_number_map[el]
508
518
520 from hedge.tools import log_shape
521 ls = log_shape(field)
522 if ls != ():
523 result = numpy.array(ls, dtype=object)
524
525 from pytools import indices_in_shape
526
527 for i in indices_in_shape(ls):
528 result[i] = self.volume_to_gpu(field[i])
529 return result
530 else:
531 copy_vec = numpy.empty((self.gpu_dof_count(),), dtype=numpy.float32)
532
533 block_offset = 0
534 block_size = self.flux_plan.dofs_per_block()
535 for block in self.blocks:
536 face_length = block.local_discretization.face_node_count()
537 el_length = block.local_discretization.node_count()
538
539 for el_offset, cpu_slice in zip(
540 block.el_offsets_list, block.cpu_slices):
541 copy_vec[block_offset+el_offset:block_offset+el_offset+el_length] = \
542 field[cpu_slice]
543
544 block_offset += block_size
545
546 return gpuarray.to_gpu(copy_vec)
547
549 from hedge.tools import log_shape
550 ls = log_shape(field)
551 if ls != ():
552 result = numpy.zeros(ls, dtype=object)
553
554 from pytools import indices_in_shape
555
556 for i in indices_in_shape(ls):
557 result[i] = self.volume_from_gpu(field[i])
558 return result
559 else:
560 copied_vec = field.get(pagelocked=True)
561 result = numpy.empty(shape=(len(self),), dtype=copied_vec.dtype)
562
563 block_offset = 0
564 block_size = self.flux_plan.dofs_per_block()
565 for block in self.blocks:
566 el_length = block.local_discretization.node_count()
567
568 for el_offset, cpu_slice in zip(
569 block.el_offsets_list, block.cpu_slices):
570 result[cpu_slice] = \
571 copied_vec[block_offset+el_offset
572 :block_offset+el_offset+el_length]
573
574 block_offset += block_size
575
576 return result
577
578 @memoize_method
580 """Return an array of indices embedding a CPU boundary
581 field for C{tag} into the GPU boundary field."""
582
583 bdry = self.get_boundary(tag)
584 result = numpy.empty(
585 (len(bdry.nodes),),
586 dtype=numpy.intp)
587 result.fill(-1)
588
589 cpu_base = 0
590 for elface in self.mesh.tag_to_boundary.get(tag, []):
591 face_stor = self.face_storage_map[elface]
592 bdry_stor = face_stor.opposite
593 assert isinstance(bdry_stor, GPUBoundaryFaceStorage)
594
595 face_len = (bdry_stor.opposite.native_block
596 .local_discretization.face_node_count())
597 gpu_base = bdry_stor.gpu_bdry_index_in_floats
598 result[cpu_base:cpu_base+face_len] = \
599 numpy.arange(gpu_base, gpu_base+face_len)
600 cpu_base += face_len
601
602 assert (result>=0).all()
603 return result
604
606 from hedge.tools import log_shape
607 ls = log_shape(field)
608 if ls != ():
609 from pytools import indices_in_shape
610 result = numpy.zeros(ls, dtype=object)
611 for i in indices_in_shape(ls):
612 result[i] = self.boundary_to_gpu(tag, field[i])
613 return result
614 else:
615 result = cuda.pagelocked_empty(
616 (self.aligned_boundary_floats,),
617 dtype=field.dtype)
618
619
620
621
622
623
624
625
626
627
628 result.fill(17)
629 result[self.gpu_boundary_embedding(tag)] = field
630 return gpuarray.to_gpu(result)
631
632
638
643
649
650 return self.volume_to_gpu(
651 s.interpolate_volume_function(self, f, tgt_factory))
652
653 - def _new_bdry(self, tag, shape, create_func, dtype):
654 if dtype is None:
655 dtype = self.default_scalar_type
656
657 if shape == ():
658 return create_func((self.aligned_boundary_floats,), dtype=dtype)
659
660 result = numpy.empty(shape, dtype=object)
661 from pytools import indices_in_shape
662 bdry = self.get_boundary(TAG_ALL)
663 for i in indices_in_shape(shape):
664 result[i] = create_func((self.aligned_boundary_floats,), dtype=dtype)
665 return result
666
668 return self._new_bdry(tag, shape, gpuarray.empty, dtype)
669
671 return self._new_bdry(tag, shape, gpuarray.zeros, dtype)
672
677
678 return self.boundary_to_gpu(tag,
679 s.interpolate_boundary_function(self, f, tag, tgt_factory))
680
685
686 return self.boundary_to_gpu(tag,
687 s.boundary_normals(self, tag, tgt_factory))
688
690 raise NotImplementedError
691
692 @memoize_method
694 from_indices = []
695 to_indices = []
696
697 for elface in self.mesh.tag_to_boundary.get(tag, []):
698 vol_face = self.face_storage_map[elface]
699 bdry_face = vol_face.opposite
700 assert isinstance(bdry_face, GPUBoundaryFaceStorage)
701
702 vol_el_index = \
703 self.find_el_gpu_index(vol_face.el_face[0])
704 native_ilist = self.index_lists[vol_face.native_index_list_id]
705 from_indices.extend(vol_el_index+i for i in native_ilist)
706 bdry_index = bdry_face.gpu_bdry_index_in_floats
707 to_indices.extend(
708 xrange(bdry_index, bdry_index+len(native_ilist)))
709
710 return (
711 gpuarray.to_gpu(
712 numpy.array(from_indices, dtype=numpy.uint32)),
713 gpuarray.to_gpu(
714 numpy.array(to_indices, dtype=numpy.uint32)),
715 len(from_indices)
716 )
717
718
720 kernel, field_texref = _boundarize_kernel()
721
722 from_indices, to_indices, idx_count = self._boundarize_info(tag)
723 block_count, threads_per_block, elems_per_block = \
724 gpuarray.splay(idx_count)
725
726 def do_scalar(subfield):
727 from hedge.mesh import TAG_ALL
728 if tag != TAG_ALL:
729 result = self.boundary_zeros(tag)
730 else:
731 result = self.boundary_empty(tag)
732
733 if idx_count:
734 subfield.bind_to_texref(field_texref)
735 kernel(result, to_indices, from_indices,
736 numpy.uint32(idx_count),
737 block=(threads_per_block,1,1), grid=(block_count,1),
738 texrefs=[field_texref])
739 return result
740
741 from hedge.tools import log_shape
742 ls = log_shape(field)
743
744 if ls == ():
745 return do_scalar(field)
746 else:
747 result = numpy.empty(ls, dtype=object)
748 from pytools import indices_in_shape
749 for i in indices_in_shape(ls):
750 result[i] = do_scalar(field[i])
751
752 return result
753
754
755 s = hedge.discretization.Discretization
756 host_volume_empty = s.volume_empty
757 host_volume_zeros = s.volume_zeros
758 del s
759
760
764