from __future__ import division

__copyright__ = "Copyright (C) 2008 Andreas Kloeckner"

__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see U{http://www.gnu.org/licenses/}.
"""


import numpy
import hedge.discretization
import hedge.mesh
import hedge._internal
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pytools import memoize_method, Record
from hedge.tools import Future
34 """Describes what data is local to each thread block on the GPU.
35
36 @ivar number: The global number of this block.
37 @ivar local_discretization: The L{hedge.element.Element} instance used
38 for elements in this block.
39 @ivar cpu_slices: A list of slices describing the CPU-side
40 storage locations for the block's elements.
41 @ivar microblocks: A list of lists of L{hedge.mesh.Element} instances,
42 each representing the elements in one block, and together representing
43 one block.
44 @ivar el_offsets_list: A lsit containing the offsets of the elements in
45 this block, in order.
46 @ivar el_number_map: A dictionary mapping L{hedge.mesh.Element} instances
47 to their number within this block.
48 """
49 __slots__ = ["number", "local_discretization", "cpu_slices", "microblocks",
50 "el_offsets_list", "el_number_map", "el_number_map"
51 ]
52
53 - def __init__(self, number, local_discretization, cpu_slices, microblocks,
54 el_offsets_list, el_number_map):
55 self.number = number
56 self.local_discretization = local_discretization
57 self.cpu_slices = cpu_slices
58 self.microblocks = microblocks
59 self.el_offsets_list = el_offsets_list
60 self.el_number_map = el_number_map


class GPUFaceStorage(object):
    """Describes where the dofs of an element face are stored.

    @ivar opposite: the L{GPUFaceStorage} instance for the face
      opposite to this one.
    """
    __slots__ = ["opposite"]

    def __init__(self):
        self.opposite = None

    def set_opposite(self, opp):
        if self.opposite is None:
            self.opposite = opp
        else:
            assert self.opposite is opp
84 """Describes storage locations for a face local to an element in a block.
85
86 @ivar el_face: a tuple C{(element, face_number)}.
87 @ivar cpu_slice: the base index of the element in CPU numbering.
88 @ivar native_index_list_id:
89 @ivar opp_write_index_list_id:
90 @ivar native_block: block in which element is to be found.
91 @ivar face_pair_side:
92 """
93 __slots__ = [
94 "el_face", "cpu_slice",
95 "native_index_list_id", "opp_write_index_list_id",
96 "global_int_flux_index_list_id", "global_ext_flux_index_list_id",
97 "native_block",
98 "face_pair_side"]
99
100 - def __init__(self, el_face, cpu_slice, native_index_list_id,
101 native_block, face_pair_side):
102 GPUFaceStorage.__init__(self)
103 self.el_face = el_face
104 self.cpu_slice = cpu_slice
105 self.native_index_list_id = native_index_list_id
106 self.native_block = native_block
107 self.face_pair_side = face_pair_side
108
110 """Describes storage locations for a boundary face.
111
112 @ivar cpu_bdry_index_in_floats: this face's starting index
113 in the CPU-based TAG_ALL boundary array [floats].
114 @ivar gpu_bdry_index_in_floats: this face's starting index
115 in the GPU-based TAG_ALL boundary array [floats].
116 @ivar face_pair_side:
117 """
118 __slots__ = [
119 "cpu_bdry_index_in_floats",
120 "gpu_bdry_index_in_floats",
121 "face_pair_side",
122 ]
123
124 - def __init__(self,
125 cpu_bdry_index_in_floats,
126 gpu_bdry_index_in_floats,
127 face_pair_side
128 ):
129 GPUFaceStorage.__init__(self)
130 self.cpu_bdry_index_in_floats = cpu_bdry_index_in_floats
131 self.gpu_bdry_index_in_floats = gpu_bdry_index_in_floats
132 self.face_pair_side = face_pair_side
133


def make_gpu_partition_greedy(adjgraph, max_block_size):

    def first(iterable):
        it = iter(iterable)
        try:
            return it.next()
        except StopIteration:
            return None

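    # Strategy: grow one block at a time by breadth-first traversal of the
    # element adjacency graph, always absorbing the queued element with the
    # most neighbors already inside the block. This keeps blocks compact
    # and cross-block face counts low.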
    def bfs(top_node):
        queue = [top_node]

        result = set()

        def num_connections_to_result(node):
            return sum(1 for rn in result if node in adjgraph[rn])

        from pytools import argmax2

        while True:
            curr_node_idx = argmax2((i, num_connections_to_result(qn))
                    for i, qn in enumerate(queue))

            curr_node = queue.pop(curr_node_idx)

            if curr_node in avail_nodes:
                avail_nodes.remove(curr_node)
                result.add(curr_node)
                if len(result) == max_block_size:
                    return result, first(
                            node for node in queue if node in avail_nodes)

                queue.extend(adjgraph[curr_node])

            if not queue:
                if avail_nodes:
                    queue.append(iter(avail_nodes).next())
                else:
                    return result, None

    avail_nodes = set(adjgraph.iterkeys())
    next_node = None

    partition = [0]*len(adjgraph)

    blocks = []
    while avail_nodes:
        if next_node is None:
            from pytools import argmax2
            next_node = argmax2((node, len(adjgraph[node]))
                    for node in avail_nodes)

        block, next_node = bfs(next_node)

        for el in block:
            partition[el] = len(blocks)
        blocks.append(block)

    return partition, blocks
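
# Example (hypothetical 3-element chain):
#   make_gpu_partition_greedy({0: [1], 1: [0, 2], 2: [1]}, max_block_size=2)
# would yield a partition like [0, 0, 1] with blocks [{0, 1}, {2}].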


def make_gpu_partition_metis(adjgraph, max_block_size):
    from pymetis import part_graph

    orig_part_count = part_count = len(adjgraph)//max_block_size+1

    attempt_count = 5
    for attempt in range(attempt_count):
        if part_count > 1:
            cuts, partition = part_graph(part_count,
                    adjgraph, vweights=[1000]*len(adjgraph))
        else:
            # part_graph is not called for a single part--every element
            # simply lands in block 0.
            partition = [0]*len(adjgraph)

        blocks = [[] for i in range(part_count)]
        for el_id, block in enumerate(partition):
            blocks[block].append(el_id)
        block_elements = max(len(block_els) for block_els in blocks)

        if block_elements <= max_block_size:
            return partition, blocks

        # The largest block is still too big: ask for more parts and retry.
        # (max(1, ...) guarantees progress for small part counts.)
        part_count += max(1, min(5, int(part_count*0.01)))

    from warnings import warn
    warn("could not achieve Metis partition after %d attempts, "
            "falling back to greedy" % attempt_count)

    return make_gpu_partition_greedy(adjgraph, max_block_size)




class Discretization(hedge.discretization.Discretization):
    from hedge.backends.cuda.execute import ExecutionMapper \
            as exec_mapper_class
    from hedge.backends.cuda.execute import Executor \
            as executor_class

    @classmethod
    def all_debug_flags(cls):
        return hedge.discretization.Discretization.all_debug_flags() | set([
            "cuda_ilist_generation",
            "cuda_compare",
            "cuda_diff",
            "cuda_diff_plan",
            "cuda_flux",
            "cuda_lift",
            "cuda_gather_plan",
            "cuda_lift_plan",
            "cuda_debugbuf",
            "cuda_memory",
            "cuda_dump_kernels",
            "cuda_fastbench",
            "cuda_no_microblock",
            "cuda_no_smem_matrix",
            "cuda_no_plan",
            "cuda_no_plan_el_local",
            "cuda_keep_kernels",
            "cuda_try_no_microblock",
            "cuda_plan_log",
            "cuda_plan_no_progress",
            ])
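
    # These flags are passed via the C{debug} argument of the constructor,
    # e.g. (hypothetical mesh and order values):
    #   discr = Discretization(mesh, order=4,
    #           debug=set(["cuda_plan_log", "cuda_memory"]))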

    class PartitionData(Record):
        pass

    def _get_partition_data(self, max_block_size):
        try:
            return self.partition_cache[max_block_size]
        except KeyError:
            pass

        try:
            import pymetis
            metis_available = True
        except ImportError:
            metis_available = False

        if max_block_size >= 10 and metis_available:
            partition, blocks = make_gpu_partition_metis(
                    self.mesh.element_adjacency_graph(),
                    max_block_size)
        else:
            partition, blocks = make_gpu_partition_greedy(
                    self.mesh.element_adjacency_graph(),
                    max_block_size)

        block2extifaces = dict((i, 0) for i in range(len(blocks)))

        for (e1, f1), (e2, f2) in self.mesh.both_interfaces():
            b1 = partition[e1.id]
            b2 = partition[e2.id]

            if b1 != b2:
                block2extifaces[b1] += 1

        for el, face_nbr in self.mesh.tag_to_boundary[hedge.mesh.TAG_REALLY_ALL]:
            b1 = partition[el.id]
            block2extifaces[b1] += 1

        eg, = self.element_groups

        max_facepairs = 0
        int_face_pair_count = 0
        face_pair_count = 0

        for b in range(len(blocks)):
            b_ext_faces = block2extifaces[b]
            b_int_faces = (len(blocks[b])*eg.local_discretization.face_count()
                    - b_ext_faces)
            assert b_int_faces % 2 == 0
            b_facepairs = b_int_faces//2 + b_ext_faces

            int_face_pair_count += b_int_faces//2
            max_facepairs = max(max_facepairs, b_facepairs)
            face_pair_count += b_facepairs

        from pytools import average

        result = self.PartitionData(
                partition=partition,
                blocks=blocks,
                max_face_pair_count=max_facepairs,
                ext_face_avg=average(
                    block2extifaces.itervalues()),
                int_face_pair_avg=int_face_pair_count/len(blocks),
                face_pair_avg=face_pair_count/len(blocks),
                )

        self.partition_cache[max_block_size] = result
        return result

    def __init__(self, mesh, local_discretization=None,
            order=None, init_cuda=True, debug=set(),
            device=None, default_scalar_type=numpy.float32,
            tune_for=None, run_context=None,
            mpi_cuda_dev_filter=lambda dev: True):
        """
        @arg tune_for: An optemplate for whose application this
          discretization's flux plan will be tuned.
        """

        if tune_for is None:
            from warnings import warn
            warn("You can achieve better performance if you pass an optemplate "
                    "in the tune_for= kwarg.")

        ldis = self.get_local_discretization(mesh, local_discretization, order)

        hedge.discretization.Discretization.__init__(self, mesh, ldis,
                debug=debug, default_scalar_type=default_scalar_type)

        if init_cuda:
            cuda.init()

        if device is None:
            if run_context is None:
                from pycuda.tools import get_default_device
                device = get_default_device()
            else:
                from hedge.backends.cuda.tools import mpi_get_default_device
                device = mpi_get_default_device(run_context.communicator,
                        dev_filter=mpi_cuda_dev_filter)

        if isinstance(device, int):
            device = cuda.Device(device)
        if init_cuda:
            self.cuda_context = device.make_context()
        else:
            self.cuda_context = None

        self.device = device
        from pycuda.tools import DeviceData

        if "cuda_memory" in self.debug:
            from pycuda.tools import DebugMemoryPool
            if run_context is not None and run_context.ranks > 1:
                self.pool = DebugMemoryPool(
                        interactive=False,
                        logfile=open("rank-%d-mem.log" % run_context.rank, "w"))
            else:
                self.pool = DebugMemoryPool(
                        interactive=False,
                        logfile=open("mem.log", "w"))
        else:
            from pycuda.tools import DeviceMemoryPool
            self.pool = DeviceMemoryPool()

        from pycuda.tools import PageLockedMemoryPool
        self.pagelocked_pool = PageLockedMemoryPool()

        self.partition_cache = {}

        from pytools import Record
        class OverallPlan(Record):
            pass

        def generate_overall_plans():
            if "cuda_no_microblock" in self.debug:
                allow_mb_values = [False]
            elif "cuda_try_no_microblock" in self.debug:
                allow_mb_values = [True, False]
            else:
                allow_mb_values = [True]

            for allow_mb in allow_mb_values:
                from hedge.backends.cuda.plan import PlanGivenData
                given = PlanGivenData(
                        DeviceData(device), ldis,
                        allow_microblocking=allow_mb,
                        float_type=default_scalar_type)

                import hedge.backends.cuda.fluxgather as fluxgather
                flux_plan, flux_time = fluxgather.make_plan(
                        self, given, tune_for)

                pdata = self._get_partition_data(
                        flux_plan.elements_per_block())
                given.post_decomposition(
                        block_count=len(pdata.blocks),
                        microblocks_per_block=flux_plan.microblocks_per_block())

                from hedge.backends.cuda.plan import \
                        make_diff_plan, \
                        make_element_local_plan

                diff_plan, diff_time = make_diff_plan(self, given)
                fluxlocal_plan, fluxlocal_time = make_element_local_plan(
                        self, given,
                        op_name="lift",
                        aligned_preimage_dofs_per_microblock=
                            given.aligned_face_dofs_per_microblock(),
                        preimage_dofs_per_el=given.face_dofs_per_el(),
                        with_index_check=False)

                sys_size = flux_plan.flux_count
                total_time = flux_time + sys_size*(diff_time+fluxlocal_time)

                yield OverallPlan(
                        given=given,
                        flux_plan=flux_plan,
                        partition=pdata.partition,
                        diff_plan=diff_plan,
                        fluxlocal_plan=fluxlocal_plan), total_time

        overall_plans = list(generate_overall_plans())

        for op, total_time in overall_plans:
            print "microblocking=%s -> time=%g" % (
                    op.given.microblock.elements != 1, total_time)

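        # Choose the overall plan with the smallest estimated total time.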
        from pytools import argmin2
        overall_plan = argmin2(overall_plans)

        self.given = overall_plan.given
        self.flux_plan = overall_plan.flux_plan
        self.partition = overall_plan.partition
        self.diff_plan = overall_plan.diff_plan
        self.fluxlocal_plan = overall_plan.fluxlocal_plan

        print "actual flux exec plan:", self.flux_plan
        print "actual diff op exec plan:", self.diff_plan
        print "actual flux local exec plan:", self.fluxlocal_plan


        self.blocks = self._build_blocks()
        self.face_storage_map = self._build_face_storage_map()

        if "cuda_compare" in self.debug:
            from hedge.backends.jit import Discretization \
                    as JITDiscretization
            self.test_discr = JITDiscretization(mesh, ldis)

        self.stream_pool = []

    def close(self):
        self.pool.stop_holding()
        self.pagelocked_pool.stop_holding()
        if self.cuda_context is not None:
            self.cuda_context.pop()

    def _build_blocks(self):
        block_el_numbers = {}
        for el_id, block in enumerate(self.partition):
            block_el_numbers.setdefault(block, []).append(el_id)

        block_count = len(block_el_numbers)

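        # Pack each block's elements into microblocks of
        # given.microblock.elements elements; element offsets are measured
        # in floats from the start of the block, with each microblock
        # padded to given.microblock.aligned_floats.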
        def make_block(block_num):
            given = self.given

            microblocks = []
            current_microblock = []
            el_offsets_list = []
            el_number_map = {}
            elements = [self.mesh.elements[ben]
                    for ben in block_el_numbers.get(block_num, [])]

            for block_el_nr, el in enumerate(elements):
                el_offset = (
                        len(microblocks)*given.microblock.aligned_floats
                        + len(current_microblock)*given.dofs_per_el())
                el_number_map[el] = block_el_nr
                el_offsets_list.append(el_offset)

                current_microblock.append(el)
                if len(current_microblock) == given.microblock.elements:
                    microblocks.append(current_microblock)
                    current_microblock = []

            if current_microblock:
                microblocks.append(current_microblock)

            assert len(microblocks) <= self.flux_plan.microblocks_per_block()

            eg, = self.element_groups
            return GPUBlock(block_num,
                    local_discretization=eg.local_discretization,
                    cpu_slices=[self.find_el_range(el.id) for el in elements],
                    microblocks=microblocks,
                    el_offsets_list=el_offsets_list,
                    el_number_map=el_number_map)

        return [make_block(block_num) for block_num in range(block_count)]


    def _build_face_storage_map(self):
        # Maps (element, face_number) tuples to GPUFaceStorage instances,
        # assigns GPU-side storage to boundary faces, and collects all face
        # index lists in a registry.
        fsm = {}

        from hedge.tools import IndexListRegistry
        fil_registry = IndexListRegistry("cuda_ilist_generation" in self.debug)

        def make_int_face(face_pair_side):
            el = self.mesh.elements[face_pair_side.element_id]
            elface = (el, face_pair_side.face_id)

            block = self.blocks[self.partition[el.id]]
            iln = fil_registry.register(
                    (ldis, face_pair_side.face_id),
                    lambda: ldis.face_indices()[face_pair_side.face_id])
            result = GPUInteriorFaceStorage(
                    elface,
                    cpu_slice=self.find_el_range(el.id),
                    native_index_list_id=iln,
                    native_block=block,
                    face_pair_side=face_pair_side)

            assert elface not in fsm
            fsm[elface] = result
            return result

        int_fg, = self.face_groups
        ldis = int_fg.ldis_loc
        assert ldis == int_fg.ldis_opp

        id_face_index_list_number = fil_registry.register(
                None,
                lambda: tuple(xrange(ldis.face_node_count())))
        assert id_face_index_list_number == 0

        for fp in int_fg.face_pairs:
            face1 = make_int_face(fp.loc)
            face2 = make_int_face(fp.opp)
            face1.opposite = face2
            face2.opposite = face1

            def apply_write_map(wmap, sequence):
                result = [None] * len(sequence)
                for wm_i, seq_i in zip(wmap, sequence):
                    result[wm_i] = seq_i
                assert None not in result
                return tuple(result)

            f_ind = ldis.face_indices()

            def face1_in_el_ilist():
                return tuple(int_fg.index_lists[
                    fp.loc.face_index_list_number])

            def face2_in_el_ilist():
                return tuple(int_fg.index_lists[
                    fp.opp.face_index_list_number])

            def opp_write_map():
                return tuple(
                        int_fg.index_lists[fp.opp_native_write_map])

            face1.global_int_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.loc.face_index_list_number),
                    face1_in_el_ilist)
            face1.global_ext_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp.face_index_list_number),
                    face2_in_el_ilist)

            face2.global_int_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp_native_write_map,
                        fp.opp.face_index_list_number),
                    lambda: apply_write_map(
                        opp_write_map(), face2_in_el_ilist()))
            face2.global_ext_flux_index_list_id = fil_registry.register(
                    (int_fg, fp.opp_native_write_map,
                        fp.loc.face_index_list_number),
                    lambda: apply_write_map(
                        opp_write_map(), face1_in_el_ilist()))

            from pytools import get_write_to_map_from_permutation as gwtm

            face1.opp_write_index_list_id = fil_registry.register(
                    (int_fg, "wtm", fp.opp.face_index_list_number,
                        fp.opp.face_id),
                    lambda: gwtm(face2_in_el_ilist(), f_ind[fp.opp.face_id]))
            face2.opp_write_index_list_id = fil_registry.register(
                    (int_fg, "wtm",
                        fp.opp_native_write_map,
                        fp.loc.face_index_list_number,
                        fp.loc.face_id),
                    lambda: gwtm(
                        apply_write_map(opp_write_map(), face1_in_el_ilist()),
                        f_ind[fp.loc.face_id]))

        self.aligned_boundary_floats = 0
        from hedge.mesh import TAG_REALLY_ALL

        for bdry_fg in self.get_boundary(TAG_REALLY_ALL).face_groups:
            if bdry_fg.ldis_loc is None:
                assert len(bdry_fg.face_pairs) == 0
                continue

            assert ldis == bdry_fg.ldis_loc

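            # Reserve an aligned chunk of the GPU boundary array for each
            # face, so that every face's data starts at an aligned offset.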
            aligned_fnc = self.given.devdata.align_dtype(
                    ldis.face_node_count(), self.given.float_size())
            for fp in bdry_fg.face_pairs:
                assert fp.opp.element_id == hedge._internal.INVALID_ELEMENT

                face1 = make_int_face(fp.loc)
                face2 = GPUBoundaryFaceStorage(
                        fp.opp.el_base_index,
                        self.aligned_boundary_floats,
                        fp.opp)
                self.aligned_boundary_floats += aligned_fnc
                face1.opposite = face2
                face2.opposite = face1

                face1.global_int_flux_index_list_id = fil_registry.register(
                        (bdry_fg, fp.loc.face_index_list_number),
                        lambda: tuple(bdry_fg.index_lists[
                            fp.loc.face_index_list_number]))
                face1.global_ext_flux_index_list_id = fil_registry.register(
                        (bdry_fg, fp.opp.face_index_list_number),
                        lambda: tuple(bdry_fg.index_lists[
                            fp.opp.face_index_list_number]))

        self.index_lists = fil_registry.index_lists
        return fsm


    def _get_stream(self):
        if not self.stream_pool:
            return cuda.Stream()
        else:
            return self.stream_pool.pop()

    def _release_stream(self, s):
        self.stream_pool.append(s)


    def add_instrumentation(self, mgr):
        mgr.set_constant("flux_plan", str(self.flux_plan))
        mgr.set_constant("diff_plan", str(self.diff_plan))
        mgr.set_constant("fluxlocal_plan", str(self.fluxlocal_plan))

        from pytools.log import EventCounter

        self.gmem_bytes_gather = EventCounter("gmem_bytes_gather",
                "Bytes of gmem traffic during gather")
        self.gmem_bytes_el_local = EventCounter("gmem_bytes_el_local",
                "Bytes of gmem traffic during element-local matrix application")
        self.gmem_bytes_diff = EventCounter("gmem_bytes_diff",
                "Bytes of gmem traffic during differentiation")
        self.gmem_bytes_vector_math = EventCounter("gmem_bytes_vector_math",
                "Bytes of gmem traffic during vector math")
        self.gmem_bytes_rk4 = EventCounter("gmem_bytes_rk4",
                "Bytes of gmem traffic during RK4")

        mgr.add_quantity(self.gmem_bytes_gather)
        mgr.add_quantity(self.gmem_bytes_el_local)
        mgr.add_quantity(self.gmem_bytes_diff)
        mgr.add_quantity(self.gmem_bytes_vector_math)
        mgr.add_quantity(self.gmem_bytes_rk4)

        hedge.discretization.Discretization.add_instrumentation(self, mgr)

    def find_el_gpu_index(self, el):
        given = self.given
        block = self.blocks[self.partition[el.id]]

        mb_nr, in_mb_nr = divmod(
                block.el_number_map[el], given.microblock.elements)

        return (block.number * self.flux_plan.dofs_per_block()
                + mb_nr*given.microblock.aligned_floats
                + in_mb_nr*given.dofs_per_el())
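
    # Example with hypothetical plan numbers: with 2 elements per microblock,
    # 3 dofs per element and microblock.aligned_floats == 8, block element
    # number 3 maps to mb_nr == 1, in_mb_nr == 1, i.e. 8 + 3 == 11 floats
    # past the block's base offset.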

    def find_number_in_block(self, el):
        block = self.blocks[self.partition[el.id]]
        return block.el_number_map[el]

    @memoize_method
    def gpu_dof_count(self):
        from hedge.backends.cuda.tools import int_ceiling

        fplan = self.flux_plan
        return int_ceiling(
                int_ceiling(
                    fplan.dofs_per_block() * len(self.blocks),
                    self.diff_plan.dofs_per_macroblock()),
                self.fluxlocal_plan.dofs_per_macroblock())

    @memoize_method
    def _gpu_volume_embedding(self):
        result = numpy.zeros((len(self.nodes),), dtype=numpy.intp)
        block_offset = 0
        block_size = self.flux_plan.dofs_per_block()
        for block in self.blocks:
            el_length = block.local_discretization.node_count()

            for el_offset, cpu_slice in zip(
                    block.el_offsets_list, block.cpu_slices):
                result[cpu_slice] = \
                        block_offset+el_offset+numpy.arange(el_length)

            block_offset += block_size

        assert (result <= self.gpu_dof_count()).all()

        return result
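
    # Note: _gpu_volume_embedding()[i] is the position of CPU dof number i
    # in the (padded) GPU volume vector.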

    @memoize_method
    def _meaningful_volume_indices(self):
        return gpuarray.to_gpu(
                numpy.asarray(
                    numpy.sort(self._gpu_volume_embedding()),
                    dtype=numpy.uint32),
                allocator=self.pool.allocate)

    def _volume_to_gpu(self, field):
        def f(subfld):
            cpu_transfer = self.pagelocked_pool.allocate(
                    (self.gpu_dof_count(),), dtype=subfld.dtype)

            cpu_transfer[self._gpu_volume_embedding()] = subfld
            return gpuarray.to_gpu(cpu_transfer, allocator=self.pool.allocate)

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)

    def _volume_from_gpu(self, field):
        def f(subfld):
            return subfld.get()[self._gpu_volume_embedding()]

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)

    @memoize_method
    def _gpu_boundary_embedding(self, tag):
        """Return an array of indices embedding a CPU boundary
        field for C{tag} into the GPU boundary field."""

        bdry = self.get_boundary(tag)
        result = numpy.empty(
                (len(bdry.nodes),),
                dtype=numpy.intp)
        result.fill(-1)

        cpu_base = 0
        for elface in self.mesh.tag_to_boundary.get(tag, []):
            face_stor = self.face_storage_map[elface]
            bdry_stor = face_stor.opposite
            assert isinstance(bdry_stor, GPUBoundaryFaceStorage)

            face_len = (bdry_stor.opposite.native_block
                    .local_discretization.face_node_count())
            gpu_base = bdry_stor.gpu_bdry_index_in_floats
            result[cpu_base:cpu_base+face_len] = \
                    numpy.arange(gpu_base, gpu_base+face_len)
            cpu_base += face_len

        assert (result >= 0).all()
        return result

    @memoize_method
    def _gpu_boundary_embedding_on_gpu(self, tag):
        return gpuarray.to_gpu(
                numpy.asarray(
                    self._gpu_boundary_embedding(tag),
                    dtype=numpy.uint32))

    class _BoundaryToGPUFuture(Future):
        def __init__(self, discr, field, tag, read_map=None):
            self.discr = discr

            from hedge.tools import log_shape

            ls = log_shape(field)
            if ls == ():
                field_list = [field]
                n = 1
            else:
                field_list = field
                n, = ls

            one_field = field_list[0]
            one_field_size = len(one_field)

            if field.dtype == object:
                self.buf = buf = discr.pagelocked_pool.allocate(
                        ls+one_field.shape, dtype=discr.default_scalar_type)
                for i, subf in enumerate(field):
                    buf[i, :] = subf
            else:
                assert field.flags.c_contiguous
                buf = field

            self.stream = discr._get_stream()

            buf.shape = buf.size,
            try:
                self.buf_gpu = buf_gpu = gpuarray.to_gpu_async(
                        buf, discr.pool.allocate, self.stream)
            except cuda.LogicError:
                # transfer buffer was not page-locked: fall back to a
                # synchronous copy
                self.buf_gpu = buf_gpu = gpuarray.to_gpu(
                        buf, discr.pool.allocate)

            from hedge.tools import make_obj_array
            out = make_obj_array([
                discr.boundary_empty(tag) for i in range(n)])

            if one_field_size:
                if read_map is None:
                    gpuarray.multi_put(
                            arrays=[
                                buf_gpu[i*one_field_size:(i+1)*one_field_size]
                                for i in range(n)],
                            dest_indices=discr._gpu_boundary_embedding_on_gpu(tag),
                            out=out, stream=self.stream)
                else:
                    gpuarray.multi_take_put(
                            arrays=[buf_gpu for i in range(n)],
                            dest_indices=discr._gpu_boundary_embedding_on_gpu(tag),
                            src_indices=read_map,
                            src_offsets=[i*one_field_size for i in range(n)],
                            out=out, stream=self.stream)

            if ls == ():
                self.result = out[0]
            else:
                self.result = out

        def is_ready(self):
            return self.stream.is_done()

        def __call__(self):
            self.stream.synchronize()
            self.discr._release_stream(self.stream)
            return self.result

    def _boundary_from_gpu(self, field, tag):
        def f(subfld):
            return subfld.get()[self._gpu_boundary_embedding(tag)]

        from hedge.tools import with_object_array_or_scalar
        return with_object_array_or_scalar(f, field)
    def _empty_gpuarray(self, shape, dtype):
        return gpuarray.empty(shape, dtype=dtype,
                allocator=self.pool.allocate)

    def _zeros_gpuarray(self, shape, dtype):
        result = gpuarray.empty(shape, dtype=dtype,
                allocator=self.pool.allocate)
        result.fill(0)
        return result

    def _new_vec(self, shape, create_func, dtype, base_size):
        if dtype is None:
            dtype = self.default_scalar_type

        if shape == ():
            return create_func((base_size,), dtype=dtype)

        result = numpy.empty(shape, dtype=object)
        from pytools import indices_in_shape
        for i in indices_in_shape(shape):
            result[i] = create_func((base_size,), dtype=dtype)
        return result
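
    # create_func is expected to be one of _empty_gpuarray/_zeros_gpuarray
    # above: scalar shapes yield a single flat vector of base_size entries,
    # object-array shapes yield one such vector per component.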


    compute_kind = "gpu"

    def get_kind(self, field):
        if isinstance(field, gpuarray.GPUArray):
            return "gpu"

        from hedge.tools import log_shape
        from pytools import indices_in_shape

        first_field = field[iter(indices_in_shape(log_shape(field))).next()]
        if isinstance(first_field, numpy.ndarray):
            return "numpy"
        elif isinstance(first_field, gpuarray.GPUArray):
            return "gpu"
        else:
            raise TypeError("invalid field kind")
    @memoize_method
    def _boundarize_info(self, tag):
        from_indices = []
        to_indices = []

        for elface in self.mesh.tag_to_boundary.get(tag, []):
            vol_face = self.face_storage_map[elface]
            bdry_face = vol_face.opposite
            assert isinstance(bdry_face, GPUBoundaryFaceStorage)

            vol_el_index = \
                    self.find_el_gpu_index(vol_face.el_face[0])
            native_ilist = self.index_lists[vol_face.native_index_list_id]
            from_indices.extend(vol_el_index+i for i in native_ilist)
            bdry_index = bdry_face.gpu_bdry_index_in_floats
            to_indices.extend(
                    xrange(bdry_index, bdry_index+len(native_ilist)))

        return from_indices, to_indices

    @memoize_method
    def _gpu_boundarize_info(self, tag):
        from_indices, to_indices = self._boundarize_info(tag)
        return (
                gpuarray.to_gpu(
                    numpy.array(from_indices, dtype=numpy.uint32)),
                gpuarray.to_gpu(
                    numpy.array(to_indices, dtype=numpy.uint32)),
                )

    @memoize_method
    def _numpy_boundarize_info(self, tag):
        from_indices, to_indices = self._boundarize_info(tag)

        temp_tgt = numpy.zeros(
                (self.aligned_boundary_floats,), dtype=numpy.int32)
        temp_tgt[to_indices] = from_indices

        result = temp_tgt[self._gpu_boundary_embedding(tag)]

        return gpuarray.to_gpu(result)

    class _BoundarizeGPUToNumpyFuture(Future):
        def __init__(self, discr, field, log_shape, tag):
            self.discr = discr

            base_size = len(discr.get_boundary(tag).nodes)
            self.result = result = discr.pagelocked_pool.allocate(
                    log_shape+(base_size,),
                    dtype=discr.default_scalar_type)
            self.result_gpu = gpuarray.empty((result.size,), result.dtype,
                    allocator=discr.pool.allocate)

            self.stream = discr._get_stream()
            gpuarray.multi_take(field, discr._numpy_boundarize_info(tag),
                    [self.result_gpu[i*base_size:(i+1)*base_size]
                        for i in range(len(field))],
                    stream=self.stream)
            self.result_gpu.get_async(self.stream, result)

        def is_ready(self):
            return self.stream.is_done()

        def __call__(self):
            self.stream.synchronize()
            self.discr._release_stream(self.stream)
            return self.result

    def boundarize_volume_field(self, field, tag, kind=None):
        if self.get_kind(field) == self.compute_kind:
            from hedge.tools import log_shape
            ls = log_shape(field)

            if kind is None or kind == self.compute_kind:
                from_indices, to_indices = \
                        self._gpu_boundarize_info(tag)
                kind = None

                from hedge.mesh import TAG_ALL
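                # Zero-fill partial boundaries so that dofs not covered by
                # the tag hold well-defined values.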
                if tag != TAG_ALL:
                    make_new = self.boundary_zeros
                else:
                    make_new = self.boundary_empty

                if ls != ():
                    from pytools import single_valued
                    out = result = make_new(tag, shape=ls,
                            dtype=single_valued(f.dtype for f in field))
                    src = field
                else:
                    result = make_new(tag, dtype=field.dtype)
                    out = [result]
                    src = [field]

                gpuarray.multi_take_put(src,
                        to_indices, from_indices, out=out)
                if kind is None:
                    return result
                else:
                    return self.convert_boundary(result, tag, kind)

            elif kind == "numpy":
                if ls == ():
                    field = [field]

                return self._BoundarizeGPUToNumpyFuture(
                        self, field, ls, tag)()
            else:
                raise ValueError("invalid target boundary kind: %s" % kind)
        else:
            return hedge.discretization.Discretization.boundarize_volume_field(
                    self, field, tag, kind)

    def prepare_from_neighbor_map(self, indices):
        return gpuarray.to_gpu(
                numpy.asarray(indices, dtype=numpy.uint32))


    @memoize_method
1221 """For a given L{hedge.discretization._ElementGroup} instance
1222 C{elgroup}, return an index array (of dtype C{numpy.intp}) that,
1223 indexed by the block-microblock element number, gives the element
1224 number within C{elgroup}.
1225 """
1226
1227 def get_el_index_in_el_group(el):
1228 mygroup, idx = self.group_map[el.id]
1229 assert mygroup is elgroup
1230 return idx
1231
1232 given = self.given
1233
1234 el_count = len(self.blocks) * given.elements_per_block()
1235 elgroup_indices = numpy.zeros((el_count,), dtype=numpy.intp)
1236
1237 for block in self.blocks:
1238 block_elgroup_indices = [ get_el_index_in_el_group(el)
1239 for mb in block.microblocks
1240 for el in mb]
1241 offset = block.number * given.elements_per_block()
1242 elgroup_indices[offset:offset+len(block_elgroup_indices)] = \
1243 block_elgroup_indices
1244
1245 return elgroup_indices
1246
def make_block_visualization(discr):
    result = discr.volume_zeros(kind="numpy")
    for block in discr.blocks:
        for cpu_slice in block.cpu_slices:
            result[cpu_slice] = block.number

    return result