1
+
2
+ __author__ = 'hervemn'
3
+ import numpy as np
4
+ import pycuda .tools
5
+ from pycuda import characterize
6
+ import pycuda .driver as cuda
7
+ import pycuda .compiler
8
+ from pycuda import gpuarray as ga
9
+ cuda .init ()
10
+ num_gpu = cuda .Device .count ()
11
+ for i in range (0 ,num_gpu ):
12
+ tmp_dev = cuda .Device (i )
13
+ print "device_id = " ,i ,tmp_dev .name ()
14
+ id_gpu = raw_input ("Select GPU: " )
15
+ id_gpu = int (id_gpu )
16
+ curr_gpu = cuda .Device (id_gpu )
17
+ print "you have selected " , curr_gpu .name ()
18
# CUDA source for all kernels used by the `randomize` class:
# - init_rng:        seed one curand XORWOW state per generator
# - gen_rand_mat:    Poisson-resample each entry around initArray[c] * fact_sub
# - copy_mat:        deterministic scaled copy (non-random counterpart)
# - sum_along_axis / norm_along_axis / init_vect_sum: one SCN pass
# Compiled with no_extern_c=True, hence the explicit extern "C" wrapper.
kernels_cuda_src = """
#include <curand_kernel.h>

extern "C"
{

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
    int id = blockIdx.x*blockDim.x + threadIdx.x;

    if (id >= nthreads)
        return;
    curand_init(seed, id, offset, &s[id]);
}


__global__ void gen_rand_mat(int* initArray, float *randArray, curandState *state, int n_rng, int width,
                             float fact_sub)
{
    int r0 = threadIdx.x + blockDim.x * blockIdx.x;
    int r1 = threadIdx.y + blockDim.y * blockIdx.y;
    int coord = r0 * width + r1;
    int id_rng = coord % n_rng ;
    if ((r0<width) && (r1< width)) {
        float mean = initArray[coord] * fact_sub;
        randArray[coord] = curand_poisson(&state[id_rng], mean);
    }
}

__global__ void copy_mat(int* initArray, float *randArray, int width, float fact_sub)
{
    int r0 = threadIdx.x + blockDim.x * blockIdx.x;
    int r1 = threadIdx.y + blockDim.y * blockIdx.y;
    int coord = r0 * width + r1;
    if ((r0<width) && (r1< width)) {
        float mean = initArray[coord] * fact_sub;
        randArray[coord] = mean;
    }
}

__global__ void sum_along_axis(float* input, float* vect_sum, int width , int axis)
{
    int r0 = threadIdx.x + blockDim.x * blockIdx.x;
    int r1 = threadIdx.y + blockDim.y * blockIdx.y;
    int coord = r0 * width + r1;
    if ((r0<width) && (r1< width)) {
        float val = input[coord];
        int id = (axis == 0) * r0 + (axis == 1) * r1;
        val = atomicAdd( &(vect_sum[id]), (float) val);
    }
}

__global__ void norm_along_axis(float* input, float* vect_sum, int width, int axis)
{
    int r0 = threadIdx.x + blockDim.x * blockIdx.x;
    int r1 = threadIdx.y + blockDim.y * blockIdx.y;
    int coord = r0 * width + r1;
    if ((r0<width) && (r1< width)) {
        float val = input[coord];
        int id = (axis == 0) * r0 + (axis == 1) * r1;
        input[coord] = val / vect_sum[id];
    }
}

__global__ void init_vect_sum(float* vect_sum, int width)
{
    int r0 = threadIdx.x + blockDim.x * blockIdx.x;
    if (r0 < width){
        vect_sum[r0] = 0;
    }
}

} // extern "C"
"""
+ def get_rng_states (size_output , seed = 1 ):
94
+ init_rng_src = """
95
+ #include <curand_kernel.h>
96
+
97
+ extern "C"
98
+ {
99
+
100
+ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
101
+ {
102
+ int id = blockIdx.x*blockDim.x + threadIdx.x;
103
+
104
+ if (id >= nthreads)
105
+ return;
106
+ curand_init(seed, id, offset, &s[id]);
107
+ }
108
+
109
+ __global__ void make_rand(int nthreads, curandState *state, int *randArray)
110
+ {
111
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
112
+ int id_rng = blockIdx.x;
113
+ double mean = 10;
114
+ if (idx<= nthreads){
115
+ randArray[idx] = curand_poisson(&state[id_rng], mean);
116
+ //randArray[idx] = curand_uniform(&state[idx]);
117
+ }
118
+ }
119
+
120
+ } // extern "C"
121
+ """
122
+
123
+ "Return `size_rng` number of CUDA random number generator states."
124
+ curr_gpu .make_context ()
125
+ gpu_vect_rand = ga .GPUArray ((size_output ,),dtype = np .int32 )
126
+ cpu_vect_rand = np .ones ((size_output ,),dtype = np .int32 )
127
+ (free ,total )= cuda .mem_get_info ()
128
+ print ("Global memory occupancy:%f%% free" % (free * 100 / total ))
129
+
130
+ # module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True,arch="sm_30")
131
+ module = pycuda .compiler .SourceModule (init_rng_src , no_extern_c = True )
132
+ init_rng = module .get_function ('init_rng' )
133
+ make_rand = module .get_function ('make_rand' )
134
+ size_block = 1024
135
+ n_blocks = size_output // size_block + 1
136
+ rng_states = cuda .mem_alloc (n_blocks * characterize .sizeof ('curandStateXORWOW' , '#include <curand_kernel.h>' ))
137
+ init_rng (np .int32 (n_blocks ), rng_states , np .uint64 (seed ), np .uint64 (0 ), block = (64 , 1 , 1 ), grid = (n_blocks // 64 + 1 ,1 ))
138
+ try :
139
+ make_rand (np .int32 (size_output ), rng_states , gpu_vect_rand , block = (size_block , 1 , 1 ), grid = (n_blocks , 1 ))
140
+ except cuda .LogicError :
141
+ print "random number generation failed ..."
142
+
143
+ (free ,total )= cuda .mem_get_info ()
144
+ print ("Global memory occupancy:%f%% free" % (free * 100 / total ))
145
+ rng_states .free ()
146
+ gpu_vect_rand .get (ary = cpu_vect_rand )
147
+ cuda .Context .pop ()
148
+ return cpu_vect_rand
149
+
150
+
151
+ def scn (input_mat , n_iter ):
152
+ """
153
+ :param input_mat: matrice to normalize
154
+ """
155
+ scn_kernel = """
156
+
157
+ __global__ void sum_along_axis(float* input, float* vect_sum, int width , int axis)
158
+ {
159
+ int r0 = threadIdx.x + blockDim.x * blockIdx.x;
160
+ int r1 = threadIdx.y + blockDim.y * blockIdx.y;
161
+ int coord = r0 * width + r1;
162
+ if ((r0<width) && (r1< width)) {
163
+ float val = input[coord];
164
+ int id = (axis == 0) * r0 + (axis == 1) * r1;
165
+ val = atomicAdd( &(vect_sum[id]), (float) val);
166
+ }
167
+ }
168
+
169
+ __global__ void norm_along_axis(float* input, float* vect_sum, int width, int axis)
170
+ {
171
+ int r0 = threadIdx.x + blockDim.x * blockIdx.x;
172
+ int r1 = threadIdx.y + blockDim.y * blockIdx.y;
173
+ int coord = r0 * width + r1;
174
+ if ((r0<width) && (r1< width)) {
175
+ float val = input[coord];
176
+ int id = (axis == 0) * r0 + (axis == 1) * r1;
177
+ input[coord] = val / vect_sum[id];
178
+ }
179
+ }
180
+ __global__ void init_vect_sum(float* vect_sum, int width)
181
+ {
182
+ int r0 = threadIdx.x + blockDim.x * blockIdx.x;
183
+ if (r0 < width){
184
+ vect_sum[r0] = 0;
185
+ }
186
+ }
187
+ """
188
+ curr_gpu .make_context ()
189
+ # if input_mat.dtype != np.float32:
190
+ # input_mat = np.float32(input_mat)
191
+ (free , total )= cuda .mem_get_info ()
192
+ print ("Global memory occupancy:%f%% free" % (free * 100 / total ))
193
+ if free > 2 * input_mat .nbytes :
194
+ print " gpu mem space ok "
195
+
196
+ width_mat = np .int32 (input_mat .shape [0 ])
197
+ try :
198
+ n_elements = input_mat .shape [0 ]* input_mat .shape [1 ]
199
+ except IndexError :
200
+ n_elements = input_mat .shape [0 ]
201
+ n_elements = np .int32 (n_elements )
202
+ cpu_vect_sum = np .zeros ((width_mat ,), dtype = np .float32 )
203
+ gpu_vect_sum = ga .to_gpu (cpu_vect_sum )
204
+ gpu_data = ga .to_gpu (input_mat )
205
+ cpu_output = np .empty_like (input_mat )
206
+
207
+ # module = pycuda.compiler.SourceModule(scn_kernel,arch="sm_30")
208
+ module = pycuda .compiler .SourceModule (scn_kernel )
209
+ sum_along_axis = module .get_function ('sum_along_axis' )
210
+ norm_along_axis = module .get_function ('norm_along_axis' )
211
+ init_vect_sum = module .get_function ('init_vect_sum' )
212
+ size_block_x = 32
213
+ size_block_y = 32
214
+
215
+ n_blocks_x = int (width_mat )// (size_block_x ) + 1
216
+ n_blocks_y = int (width_mat )// (size_block_y ) + 1
217
+
218
+
219
+ print "size block = " , size_block_x
220
+ print "n blocks x= " , n_blocks_x
221
+ print "n blocks y= " , n_blocks_y
222
+
223
+ print "gpu data type = " , gpu_data .dtype
224
+ print "gpu vect sum type = " , gpu_vect_sum .dtype
225
+ print "n element type = " , n_elements .dtype
226
+ print "width_mat type = " , width_mat .dtype
227
+
228
+
229
+
230
+ for i in xrange (0 , n_iter ):
231
+ id_axis = np .int32 (np .mod (i , 2 ))
232
+ print "id_axis= " , id_axis
233
+ sum_along_axis ( gpu_data , gpu_vect_sum , width_mat , id_axis , block = (size_block_x , size_block_y , 1 ),
234
+ grid = (n_blocks_x , n_blocks_y ), shared = 0 )
235
+
236
+ norm_along_axis (gpu_data , gpu_vect_sum , width_mat , id_axis , block = (size_block_x , size_block_y , 1 ),
237
+ grid = (n_blocks_x , n_blocks_y ), shared = 0 )
238
+ init_vect_sum (gpu_vect_sum ,block = (64 ,1 ,1 ),grid = (int (width_mat )// 64 + 1 , 1 ))
239
+ # gpu_vect_sum = ga.to_gpu(cpu_vect_sum)
240
+
241
+ gpu_data .get (ary = cpu_output )
242
+ # gpu_data.free()
243
+ cuda .Context .pop ()
244
+ return cpu_output
245
+
246
+
247
+ class randomize ():
248
+ def __init__ (self ,init_data , n_generators ):
249
+
250
+ self .ctx = curr_gpu .make_context ()
251
+ self .module = pycuda .compiler .SourceModule (kernels_cuda_src , no_extern_c = True )
252
+ (free ,total )= cuda .mem_get_info ()
253
+ print ("Global memory occupancy:%f%% free" % (free * 100 / total ))
254
+ print ("Global free memory :%i Mo free" % (free / 10 ** 6 ))
255
+
256
+ ################################################################################################################
257
+
258
+ self .width_mat = np .int32 (init_data .shape [0 ])
259
+ # self.gpu_init_data = ga.to_gpu(init_data)
260
+ self .gpu_init_data = cuda .mem_alloc (init_data .nbytes )
261
+ cuda .memcpy_htod (self .gpu_init_data , init_data )
262
+
263
+ self .cpu_new_data = np .zeros_like (init_data ,dtype = np .float32 )
264
+ print "size new data = " ,self .cpu_new_data .nbytes / 10 ** 6
265
+ (free ,total )= cuda .mem_get_info ()
266
+ print ("Global memory occupancy:%f%% free" % (free * 100 / total ))
267
+ print ("Global free memory :%i Mo free" % (free / 10 ** 6 ))
268
+
269
+ self .gpu_new_data = cuda .mem_alloc (self .cpu_new_data .nbytes )
270
+ cuda .memcpy_htod (self .gpu_new_data , self .cpu_new_data )
271
+ # self.gpu_new_data = ga.to_gpu(self.cpu_new_data)
272
+
273
+ self .cpu_vect_sum = np .zeros ((self .width_mat ,), dtype = np .float32 )
274
+ self .gpu_vect_sum = cuda .mem_alloc (self .cpu_vect_sum .nbytes )
275
+ cuda .memcpy_htod (self .gpu_vect_sum , self .cpu_vect_sum )
276
+ # self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
277
+ ################################################################################################################
278
+ self .init_rng = self .module .get_function ('init_rng' )
279
+ self .gen_rand_mat = self .module .get_function ('gen_rand_mat' )
280
+ self .sum_along_axis = self .module .get_function ('sum_along_axis' )
281
+ self .norm_along_axis = self .module .get_function ('norm_along_axis' )
282
+ self .init_vect_sum = self .module .get_function ('init_vect_sum' )
283
+ self .copy_mat = self .module .get_function ('copy_mat' )
284
+ ################################################################################################################
285
+ self .n_generators = n_generators
286
+ seed = 1
287
+ self .rng_states = cuda .mem_alloc (n_generators * characterize .sizeof ('curandStateXORWOW' ,
288
+ '#include <curand_kernel.h>' ))
289
+ self .init_rng (np .int32 (n_generators ), self .rng_states , np .uint64 (seed ), np .uint64 (0 ), block = (64 , 1 , 1 ),
290
+ grid = (n_generators // 64 + 1 ,1 ))
291
+ (free ,total )= cuda .mem_get_info ()
292
+
293
+ size_block_x = 32
294
+ size_block_y = 32
295
+ n_blocks_x = int (self .width_mat )// (size_block_x ) + 1
296
+ n_blocks_y = int (self .width_mat )// (size_block_y ) + 1
297
+ self .grid = (n_blocks_x , n_blocks_y , 1 )
298
+ self .block = (size_block_x , size_block_y , 1 )
299
+
300
+
301
+
302
+ def generate_new_matrix (self ,n_iter , do_random , fact_sub_sampling ):
303
+ # fact_sub_sampling = np.float32(0.5)
304
+ if do_random :
305
+ self .gen_rand_mat (self .gpu_init_data , self .gpu_new_data , self .rng_states , np .int32 (self .n_generators ),
306
+ self .width_mat , np .float32 (fact_sub_sampling ),
307
+ block = self .block , grid = self .grid , shared = 0 )
308
+ else :
309
+ self .copy_mat (self .gpu_init_data , self .gpu_new_data , self .width_mat ,np .float32 (fact_sub_sampling ),
310
+ block = self .block , grid = self .grid , shared = 0 )
311
+ print "matrix generated"
312
+ for i in xrange (0 , n_iter ):
313
+ # print "iter,",i
314
+ id_axis = np .int32 (np .mod (i , 2 ))
315
+ self .sum_along_axis (self .gpu_new_data , self .gpu_vect_sum , self .width_mat , id_axis ,
316
+ block = self .block , grid = self .grid , shared = 0 )
317
+
318
+ self .norm_along_axis (self .gpu_new_data , self .gpu_vect_sum , self .width_mat , id_axis ,
319
+ block = self .block , grid = self .grid , shared = 0 )
320
+
321
+ self .init_vect_sum (self .gpu_vect_sum , block = (64 ,1 ,1 ), grid = (int (self .width_mat )// 64 + 1 , 1 ))
322
+
323
+ # self.gpu_new_data.get(ary=self.cpu_new_data)
324
+ cuda .memcpy_dtoh (self .cpu_new_data , self .gpu_new_data )
325
+ return self .cpu_new_data
326
+
327
+ def free_gpu (self ,):
328
+ self .rng_states .free ()
329
+ self .gpu_vect_sum .free ()
330
+ self .gpu_new_data .free ()
331
+ self .gpu_init_data .free ()
332
+ (free ,total )= cuda .mem_get_info ()
333
+ print ("Global memory occupancy after cleaning processes: %f%% free" % (free * 100 / total ))
334
+ print ("Global free memory :%i Mo free" % (free / 10 ** 6 ))
335
+ del self .module
336
+ self .ctx .detach ()
337
+ # self.ctx.pop()
338
+ # cuda.Context.pop()
339
+
340
+ if __name__ == '__main__' :
341
+
342
+ import numpy as np
343
+ a = np .ones ((10000 , 10000 ),dtype = np .int32 )
344
+ gen = randomize (a , 1000 )
345
+ out = gen .generate_new_matrix (10 )
346
+ gen .free_gpu ()
347
+ print out
0 commit comments