1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24
25 import pycuda.driver as cuda
26
27
28
29

def exact_div(dividend, divisor):
    quot, rem = divmod(dividend, divisor)
    assert rem == 0
    return quot

37 """Round C{value} up to be a C{multiple_of} something."""
38
39
40 from math import ceil
41 return int(ceil(value/multiple_of))*multiple_of
42
44 """Round C{value} down to be a C{multiple_of} something."""
45
46
47 from math import floor
48 return int(floor(value/multiple_of))*multiple_of
49
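# Both helpers round toward a multiple of the second argument, e.g.
# int_ceiling(5, 4) == 8 and int_floor(5, 4) == 4; with the default
# multiple_of=1 they reduce to plain ceil/floor returned as ints.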

def to_gpu(field):
    """Transfer a (possibly object-array-valued) field to the GPU."""
    from hedge.tools import log_shape
    ls = log_shape(field)
    if ls != ():
        result = numpy.zeros(ls, dtype=object)

        from pytools import indices_in_shape

        for i in indices_in_shape(ls):
            result[i] = gpuarray.to_gpu(field[i])
        return result
    else:
        return gpuarray.to_gpu(field)

def pad(s, block_size):
    missing_bytes = block_size - len(s)
    assert missing_bytes >= 0
    return s + "\x00"*missing_bytes

def pad_and_join(blocks, block_size):
    return "".join(pad(b, block_size) for b in blocks)

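# Illustration: pad_and_join(["ab", "cdef", "g"], 8) null-pads each string to
# 8 bytes and concatenates them, so block i starts at byte offset i*8 in the
# result.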

def make_blocks(devdata, data):
    from pytools import Record
    from hedge.cuda.tools import pad_and_join

    blocks = ["".join(b) for b in data]
    block_size = devdata.align(max(len(b) for b in blocks))
    return Record(
            blocks=cuda.to_device(pad_and_join(blocks, block_size)),
            max_per_block=max(len(b) for b in data),
            block_size=block_size,
            )
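# make_blocks expects `data` as a list of blocks, each block a list of strings;
# e.g. (illustrative values only) [["abc", "de"], ["fgh"]] joins each block,
# pads both to the device-aligned size of the longest, and uploads them as one
# contiguous device allocation; max_per_block is the largest number of strings
# found in any single block.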

def make_superblocks(devdata, struct_name, single_item, multi_item):
    from pytools import Record
    from hedge.cuda.tools import pad_and_join

    # single_item: list of (block_data, decl), where block_data is a list of
    # strings, one per block, all of equal length.
    # multi_item: list of (block_data, decl), where block_data is a list of
    # lists of strings, one inner list per block.

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from hedge.cuda.cgen import Struct, Value, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    return Record(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            )
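# The resulting device layout is block_count superblocks of block_bytes each:
# within a superblock, the fixed-size single_item parts come first, followed
# by each multi_item part padded out to that part's maximum per-block size.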



class DeviceData:
    def __init__(self, dev):
        import pycuda.driver as drv

        self.max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
        self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE)

        # per-multiprocessor limits that the driver does not expose as
        # attributes; these values correspond to compute capability 1.x parts
        self.thread_blocks_per_mp = 8
        self.warps_per_mp = 24

        self.registers = dev.get_attribute(drv.device_attribute.REGISTERS_PER_BLOCK)
        self.shared_memory = dev.get_attribute(drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

    def align(self, bytes):
        return int_ceiling(bytes, self.align_bytes())

    def align_bytes(self, wordsize=4):
        if wordsize == 4:
            return 64
        elif wordsize == 8:
            return 128
        elif wordsize == 16:
            return 128
        else:
            raise ValueError("no alignment possible for fetches of size %d" % wordsize)
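    # With the default 4-byte word size, align_bytes() is 64, so e.g.
    # devdata.align(100) rounds a 100-byte block up to 128 bytes; 64/128/128
    # are the alignment granularities for coalesced 4-, 8- and 16-byte fetches
    # on the hardware generation targeted here.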

    @staticmethod
    def make_valid_tex_channel_count(size):
        valid_sizes = [1, 2, 4]
        for vs in valid_sizes:
            if size <= vs:
                return vs

        raise ValueError("could not enlarge argument to valid channel count")




class Occupancy:
    def __init__(self, devdata, threads, shared_mem=0, registers=0):
        if threads > devdata.max_threads:
            raise ValueError("too many threads")

        # per-block resource usage, rounded up to the hardware's allocation
        # granularity
        alloc_warps = int_ceiling(threads/devdata.warp_size)
        alloc_regs = int_ceiling(alloc_warps*2, 4)*16*registers
        alloc_smem = int_ceiling(shared_mem, 512)

        # how many thread blocks fit on one multiprocessor, and which
        # resource imposes that limit
        self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"),
                (int_floor(devdata.warps_per_mp/alloc_warps), "warps")
                ]
        if registers > 0:
            self.tb_per_mp_limits.append((int_floor(devdata.registers/alloc_regs), "regs"))
        if shared_mem > 0:
            self.tb_per_mp_limits.append((int_floor(devdata.shared_memory/alloc_smem), "smem"))

        self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits)

        self.warps_per_mp = self.tb_per_mp * alloc_warps
        self.occupancy = self.warps_per_mp / devdata.warps_per_mp
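# Occupancy example (using the limits hard-coded in DeviceData above): 256
# threads per block occupy 8 warps, so at most int_floor(24/8) = 3 blocks fit
# on a multiprocessor; with no register or shared-memory pressure this yields
# occupancy 3*8/24 = 1.0, and limited_by records which resource was binding.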



def _test_occupancy():
    # assumes drv.init() has already been called (see __main__ below)
    devdata = DeviceData(cuda.Device(0))

    for threads in range(32, 512, 16):
        for smem in range(1024, 16384+1, 1024):
            occ = Occupancy(devdata, threads, smem)
            print "t%d s%d: %f %s" % (threads, smem, occ.occupancy, occ.limited_by)



if __name__ == "__main__":
    import pycuda.driver as drv
    drv.init()

    _test_occupancy()