Package hedge :: Package cuda :: Module tools
[frames] | [no frames]

Source Code for Module hedge.cuda.tools

  1  """Interface with Nvidia CUDA.""" 
  2   
  3  from __future__ import division 
  4   
  5  __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" 
  6   
  7  __license__ = """ 
  8  This program is free software: you can redistribute it and/or modify 
  9  it under the terms of the GNU General Public License as published by 
 10  the Free Software Foundation, either version 3 of the License, or 
 11  (at your option) any later version. 
 12   
 13  This program is distributed in the hope that it will be useful, 
 14  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  GNU General Public License for more details. 
 17   
 18  You should have received a copy of the GNU General Public License 
 19  along with this program.  If not, see U{http://www.gnu.org/licenses/}. 
 20  """ 
 21   
 22   
 23   
 24   
import numpy

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
 26   
 27   
 28   
 29   
 30  # tools ----------------------------------------------------------------------- 
def exact_div(dividend, divisor):
    """Return C{dividend} divided by C{divisor}, asserting exactness.

    Trips an AssertionError if the division leaves a remainder.
    """
    assert dividend % divisor == 0
    return dividend // divisor
35
def int_ceiling(value, multiple_of=1):
    """Round C{value} up to the nearest multiple of C{multiple_of}.

    Mirrors the "ceiling" behavior of the CUDA occupancy calculator
    spreadsheet.
    """
    import math
    num_chunks = int(math.ceil(value / multiple_of))
    return num_chunks * multiple_of
42
def int_floor(value, multiple_of=1):
    """Round C{value} down to the nearest multiple of C{multiple_of}.

    Mirrors the "floor" behavior of the CUDA occupancy calculator
    spreadsheet.
    """
    import math
    num_chunks = int(math.floor(value / multiple_of))
    return num_chunks * multiple_of
49
def vec_to_gpu(field):
    """Transfer C{field} to the GPU.

    If C{field} has a nontrivial logical shape (i.e. it is an object array
    of component fields), returns an object array of the same shape whose
    entries are GPUArrays; otherwise returns a single GPUArray.
    """
    from hedge.tools import log_shape
    ls = log_shape(field)
    if ls != ():
        # BUG FIX: numpy.array(ls, dtype=object) would create an array
        # *containing* the shape tuple (shape (len(ls),)), so the indexed
        # assignments below would fail. numpy.empty builds an empty object
        # array *of* shape ls, which is what is intended here.
        result = numpy.empty(ls, dtype=object)

        from pytools import indices_in_shape

        for i in indices_in_shape(ls):
            result[i] = gpuarray.to_gpu(field[i])
        return result
    else:
        return gpuarray.to_gpu(field)
63
def pad(s, block_size):
    """Zero-pad the string C{s} on the right to exactly C{block_size} bytes.

    C{s} must not already be longer than C{block_size}.
    """
    assert len(s) <= block_size
    return s.ljust(block_size, "\x00")
68
def pad_and_join(blocks, block_size):
    """Concatenate C{blocks}, zero-padding each one to C{block_size} bytes."""
    padded_blocks = []
    for block in blocks:
        # Same contract as pad(): blocks must fit within block_size.
        assert len(block) <= block_size
        padded_blocks.append(block + "\x00" * (block_size - len(block)))
    return "".join(padded_blocks)
71
def make_blocks(devdata, data):
    """Pack the string sequences in C{data} into one aligned device allocation.

    C{data} is a sequence of sequences of strings; each inner sequence is
    joined into one block, all blocks are padded to a common aligned size
    and copied to the GPU.

    Returns a C{Record} with attributes:
      - C{blocks}: device pointer to the packed data
      - C{max_per_block}: largest number of items in any one block
      - C{block_size}: padded (aligned) size of each block in bytes
    """
    # (Removed a duplicated "from pytools import Record" and a needless
    # self-import of pad_and_join, which lives in this very module.)
    from pytools import Record

    blocks = ["".join(b) for b in data]
    block_size = devdata.align(max(len(b) for b in blocks))
    return Record(
            blocks=cuda.to_device(pad_and_join(blocks, block_size)),
            max_per_block=max(len(b) for b in data),
            block_size=block_size,
            )
84
def make_superblocks(devdata, struct_name, single_item, multi_item):
    """Pack per-block data into aligned "superblocks" on the device,
    together with a cgen C{Struct} describing their layout.

    Layout of the arguments (kept from the original author's notes):
      - single_item = [([ block1, block2, ... ], decl), ...]
      - multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    Every single_item part contributes one fixed-size chunk per superblock;
    every multi_item part contributes an array padded to that part's
    maximum block size.

    Returns a C{Record} with attributes C{struct} (the cgen Struct),
    C{device_memory}, C{block_bytes} (aligned superblock size) and C{data}
    (the raw packed bytes).
    """
    # (Removed a duplicated "from pytools import Record" and a needless
    # self-import of pad_and_join, which lives in this very module.)
    from pytools import Record

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    # All single_item parts must agree on the number of blocks.
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from hedge.cuda.cgen import Struct, Value, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # Sanity check: all blocks of a single_item part have equal size.
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    # All superblocks must come out the same size; align that size.
    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    return Record(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            )
140 141 142 143 # knowledge about hardware ----------------------------------------------------
144 -class DeviceData:
145 - def __init__(self, dev):
146 import pycuda.driver as drv 147 148 self.max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK) 149 self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE) 150 self.thread_blocks_per_mp = 8 151 self.warps_per_mp = 24 152 self.registers = dev.get_attribute(drv.device_attribute.REGISTERS_PER_BLOCK) 153 self.shared_memory = dev.get_attribute(drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
154
155 - def align(self, bytes):
156 return int_ceiling(bytes, self.align_bytes())
157
158 - def align_dtype(self, elements, dtype_size):
159 return int_ceiling(elements, 160 self.align_words(dtype_size))
161
162 - def align_words(self, word_size):
163 return exact_div(self.align_bytes(word_size), word_size)
164
165 - def align_bytes(self, wordsize=4):
166 if wordsize == 4: 167 return 64 168 elif wordsize == 8: 169 return 128 170 elif wordsize == 16: 171 return 128 172 else: 173 raise ValueError, "no alignment possible for fetches of size %d" % wordsize
174
175 - def coalesce(self, thread_count):
176 return int_ceiling(thread_count, 16)
177 178 @staticmethod
180 valid_sizes = [1,2,4] 181 for vs in valid_sizes: 182 if size <= vs: 183 return vs 184 185 raise ValueError, "could not enlarge argument to valid channel count"
186 187 188 189
class OccupancyRecord:
    """Mimic Nvidia's CUDA occupancy calculator spreadsheet for one
    launch configuration.

    Attributes set by __init__:
      - tb_per_mp_limits: list of (limit, reason) pairs, one per
        constraining resource
      - tb_per_mp: achievable thread blocks per multiprocessor
      - limited_by: label of the binding resource
        ("device", "warps", "regs" or "smem")
      - warps_per_mp: resident warps per multiprocessor
      - occupancy: fraction of the device's warp capacity used (0..1)
    """
    def __init__(self, devdata, threads, shared_mem=0, registers=0):
        # devdata: DeviceData instance; threads: threads per block;
        # shared_mem: bytes of shared memory per block;
        # registers: registers per thread.
        if threads > devdata.max_threads:
            raise ValueError("too many threads")

        # copied literally from occupancy calculator
        # Resources are allocated in fixed granules: whole warps, register
        # chunks, and 512-byte shared-memory chunks.
        alloc_warps = int_ceiling(threads/devdata.warp_size)
        alloc_regs = int_ceiling(alloc_warps*2, 4)*16*registers
        alloc_smem = int_ceiling(shared_mem, 512)

        # Each entry: (max thread blocks per MP, limiting-resource label).
        self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"),
                (int_floor(devdata.warps_per_mp/alloc_warps), "warps")
                ]
        if registers > 0:
            self.tb_per_mp_limits.append((int_floor(devdata.registers/alloc_regs), "regs"))
        if shared_mem > 0:
            self.tb_per_mp_limits.append((int_floor(devdata.shared_memory/alloc_smem), "smem"))

        # The tightest limit wins; min() on the tuples compares counts first.
        self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits)

        self.warps_per_mp = self.tb_per_mp * alloc_warps
        # True division is in effect module-wide
        # (from __future__ import division), so this is a fraction.
        self.occupancy = self.warps_per_mp / devdata.warps_per_mp
212 213 214 215
216 -def _test_occupancy():
217 for threads in range(32, 512, 16): 218 for smem in range(1024, 16384+1, 1024): 219 occ = Occupancy(threads, smem) 220 print "t%d s%d: %f %s" % (threads, smem, occ.occupancy, occ.limited_by)
# Script entry point: initialize the CUDA driver, then run the
# occupancy self-test.
if __name__ == "__main__":
    import pycuda.driver as drv
    drv.init()

    _test_occupancy()