Skip to content

Commit 19ac4fa

Browse files
committed
first commit
0 parents  commit 19ac4fa

14 files changed

+5302
-0
lines changed

centro_main.py

+1,436
Large diffs are not rendered by default.

contig.py

+756
Large diffs are not rendered by default.

cuda_lib.py

+347
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
2+
__author__ = 'hervemn'
import numpy as np
import pycuda.tools
from pycuda import characterize
import pycuda.driver as cuda
import pycuda.compiler
from pycuda import gpuarray as ga

# Import-time side effects: initialize the CUDA driver, enumerate devices and
# interactively ask the user (on stdin) which GPU to use.  The chosen device is
# stored in the module-global `curr_gpu`, which every function/class below uses
# to create its context.
# NOTE(review): importing this module blocks on raw_input -- confirm callers
# expect an interactive prompt at import time.
cuda.init()
num_gpu = cuda.Device.count()
for i in range(0,num_gpu):
tmp_dev = cuda.Device(i)
print "device_id = ",i,tmp_dev.name()
id_gpu = raw_input("Select GPU: ")
id_gpu = int(id_gpu)
curr_gpu = cuda.Device(id_gpu)
print "you have selected ", curr_gpu.name()

# CUDA C source shared by the `randomize` class below.  Compiled at runtime
# with no_extern_c=True (the extern "C" wrapper keeps kernel names unmangled
# so module.get_function() can find them).
# Kernels:
#   init_rng        -- seed one curandState per generator id
#   gen_rand_mat    -- Poisson-resample each matrix element (mean scaled by fact_sub)
#   copy_mat        -- deterministic variant: just scale each element by fact_sub
#   sum_along_axis  -- atomicAdd row (axis==0) or column (axis==1) sums into vect_sum
#   norm_along_axis -- divide each element by its row/column sum
#   init_vect_sum   -- zero the sum vector between iterations
kernels_cuda_src = """
#include <curand_kernel.h>

extern "C"
{

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
int id = blockIdx.x*blockDim.x + threadIdx.x;

if (id >= nthreads)
return;
curand_init(seed, id, offset, &s[id]);
}


__global__ void gen_rand_mat(int* initArray, float *randArray, curandState *state, int n_rng, int width,
float fact_sub)
{
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
int coord = r0 * width + r1;
int id_rng = coord % n_rng ;
if ((r0<width) && (r1< width)) {
float mean = initArray[coord] * fact_sub;
randArray[coord] = curand_poisson(&state[id_rng], mean);
}
}

__global__ void copy_mat(int* initArray, float *randArray, int width, float fact_sub)
{
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
int coord = r0 * width + r1;
if ((r0<width) && (r1< width)) {
float mean = initArray[coord] * fact_sub;
randArray[coord] = mean;
}
}

__global__ void sum_along_axis(float* input, float* vect_sum, int width , int axis)
{
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
int coord = r0 * width + r1;
if ((r0<width) && (r1< width)) {
float val = input[coord];
int id = (axis == 0) * r0 + (axis == 1) * r1;
val = atomicAdd( &(vect_sum[id]), (float) val);
}
}

__global__ void norm_along_axis(float* input, float* vect_sum, int width, int axis)
{
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
int coord = r0 * width + r1;
if ((r0<width) && (r1< width)) {
float val = input[coord];
int id = (axis == 0) * r0 + (axis == 1) * r1;
input[coord] = val / vect_sum[id];
}
}
__global__ void init_vect_sum(float* vect_sum, int width)
{
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
if (r0 < width){
vect_sum[r0] = 0;
}
}

} // extern "C"
"""
def get_rng_states(size_output, seed=1):
94+
init_rng_src = """
95+
#include <curand_kernel.h>
96+
97+
extern "C"
98+
{
99+
100+
__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
101+
{
102+
int id = blockIdx.x*blockDim.x + threadIdx.x;
103+
104+
if (id >= nthreads)
105+
return;
106+
curand_init(seed, id, offset, &s[id]);
107+
}
108+
109+
__global__ void make_rand(int nthreads, curandState *state, int *randArray)
110+
{
111+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
112+
int id_rng = blockIdx.x;
113+
double mean = 10;
114+
if (idx<= nthreads){
115+
randArray[idx] = curand_poisson(&state[id_rng], mean);
116+
//randArray[idx] = curand_uniform(&state[idx]);
117+
}
118+
}
119+
120+
} // extern "C"
121+
"""
122+
123+
"Return `size_rng` number of CUDA random number generator states."
124+
curr_gpu.make_context()
125+
gpu_vect_rand = ga.GPUArray((size_output,),dtype=np.int32)
126+
cpu_vect_rand = np.ones((size_output,),dtype=np.int32)
127+
(free,total)=cuda.mem_get_info()
128+
print("Global memory occupancy:%f%% free"%(free*100/total))
129+
130+
# module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True,arch="sm_30")
131+
module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
132+
init_rng = module.get_function('init_rng')
133+
make_rand = module.get_function('make_rand')
134+
size_block = 1024
135+
n_blocks = size_output//size_block+1
136+
rng_states = cuda.mem_alloc(n_blocks*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
137+
init_rng(np.int32(n_blocks), rng_states, np.uint64(seed), np.uint64(0), block=(64, 1, 1), grid=(n_blocks//64 + 1,1))
138+
try:
139+
make_rand(np.int32(size_output), rng_states, gpu_vect_rand, block=(size_block, 1, 1), grid=(n_blocks, 1))
140+
except cuda.LogicError:
141+
print "random number generation failed ..."
142+
143+
(free,total)=cuda.mem_get_info()
144+
print("Global memory occupancy:%f%% free"%(free*100/total))
145+
rng_states.free()
146+
gpu_vect_rand.get(ary=cpu_vect_rand)
147+
cuda.Context.pop()
148+
return cpu_vect_rand
149+
150+
151+
def scn(input_mat, n_iter):
152+
"""
153+
:param input_mat: matrice to normalize
154+
"""
155+
scn_kernel = """
156+
157+
__global__ void sum_along_axis(float* input, float* vect_sum, int width , int axis)
158+
{
159+
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
160+
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
161+
int coord = r0 * width + r1;
162+
if ((r0<width) && (r1< width)) {
163+
float val = input[coord];
164+
int id = (axis == 0) * r0 + (axis == 1) * r1;
165+
val = atomicAdd( &(vect_sum[id]), (float) val);
166+
}
167+
}
168+
169+
__global__ void norm_along_axis(float* input, float* vect_sum, int width, int axis)
170+
{
171+
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
172+
int r1 = threadIdx.y + blockDim.y * blockIdx.y;
173+
int coord = r0 * width + r1;
174+
if ((r0<width) && (r1< width)) {
175+
float val = input[coord];
176+
int id = (axis == 0) * r0 + (axis == 1) * r1;
177+
input[coord] = val / vect_sum[id];
178+
}
179+
}
180+
__global__ void init_vect_sum(float* vect_sum, int width)
181+
{
182+
int r0 = threadIdx.x + blockDim.x * blockIdx.x;
183+
if (r0 < width){
184+
vect_sum[r0] = 0;
185+
}
186+
}
187+
"""
188+
curr_gpu.make_context()
189+
# if input_mat.dtype != np.float32:
190+
# input_mat = np.float32(input_mat)
191+
(free, total)=cuda.mem_get_info()
192+
print("Global memory occupancy:%f%% free"%(free*100/total))
193+
if free > 2 * input_mat.nbytes:
194+
print " gpu mem space ok "
195+
196+
width_mat = np.int32(input_mat.shape[0])
197+
try:
198+
n_elements = input_mat.shape[0]*input_mat.shape[1]
199+
except IndexError:
200+
n_elements = input_mat.shape[0]
201+
n_elements = np.int32(n_elements)
202+
cpu_vect_sum = np.zeros((width_mat,), dtype=np.float32)
203+
gpu_vect_sum = ga.to_gpu(cpu_vect_sum)
204+
gpu_data = ga.to_gpu(input_mat)
205+
cpu_output = np.empty_like(input_mat)
206+
207+
# module = pycuda.compiler.SourceModule(scn_kernel,arch="sm_30")
208+
module = pycuda.compiler.SourceModule(scn_kernel)
209+
sum_along_axis = module.get_function('sum_along_axis')
210+
norm_along_axis = module.get_function('norm_along_axis')
211+
init_vect_sum = module.get_function('init_vect_sum')
212+
size_block_x = 32
213+
size_block_y = 32
214+
215+
n_blocks_x = int(width_mat)//(size_block_x) + 1
216+
n_blocks_y = int(width_mat)//(size_block_y) + 1
217+
218+
219+
print "size block = ", size_block_x
220+
print "n blocks x= ", n_blocks_x
221+
print "n blocks y= ", n_blocks_y
222+
223+
print "gpu data type = ", gpu_data.dtype
224+
print "gpu vect sum type = ", gpu_vect_sum.dtype
225+
print "n element type = ", n_elements.dtype
226+
print "width_mat type = ", width_mat.dtype
227+
228+
229+
230+
for i in xrange(0, n_iter):
231+
id_axis = np.int32(np.mod(i, 2))
232+
print "id_axis= ", id_axis
233+
sum_along_axis( gpu_data, gpu_vect_sum, width_mat, id_axis, block=(size_block_x, size_block_y , 1),
234+
grid=(n_blocks_x, n_blocks_y), shared=0)
235+
236+
norm_along_axis(gpu_data, gpu_vect_sum, width_mat, id_axis, block=(size_block_x, size_block_y, 1),
237+
grid=(n_blocks_x, n_blocks_y), shared=0)
238+
init_vect_sum(gpu_vect_sum,block=(64,1,1),grid=(int(width_mat)//64 +1, 1))
239+
# gpu_vect_sum = ga.to_gpu(cpu_vect_sum)
240+
241+
gpu_data.get(ary=cpu_output)
242+
# gpu_data.free()
243+
cuda.Context.pop()
244+
return cpu_output
245+
246+
247+
class randomize():
    """GPU resampler: draws Poisson-subsampled copies of an int32 matrix and
    renormalizes them with alternating row/column passes (same kernels as
    `kernels_cuda_src` at module level).

    Keeps a dedicated CUDA context plus persistent device buffers so many
    random matrices can be generated without re-uploading the input.
    Call free_gpu() when done to release device memory and the context.
    """

    def __init__(self,init_data, n_generators):
        # Push a private context on `curr_gpu`; detached in free_gpu().
        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        (free,total)=cuda.mem_get_info()
        print("Global memory occupancy:%f%% free"%(free*100/total))
        print("Global free memory :%i Mo free"%(free/10**6))

        ################################################################################################################

        # Matrix is assumed square; width taken from the first dimension.
        self.width_mat = np.int32(init_data.shape[0])
        # self.gpu_init_data = ga.to_gpu(init_data)
        # Raw device copy of the (int) source matrix, uploaded once.
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        # Host/device buffers for the float32 output matrix.
        self.cpu_new_data = np.zeros_like(init_data,dtype=np.float32)
        print "size new data = ",self.cpu_new_data.nbytes/10**6
        (free,total)=cuda.mem_get_info()
        print("Global memory occupancy:%f%% free"%(free*100/total))
        print("Global free memory :%i Mo free"%(free/10**6))

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
        # self.gpu_new_data = ga.to_gpu(self.cpu_new_data)

        # Per-axis accumulator used by sum/norm_along_axis (starts zeroed).
        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
        # self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
        ################################################################################################################
        self.init_rng = self.module.get_function('init_rng')
        self.gen_rand_mat = self.module.get_function('gen_rand_mat')
        self.sum_along_axis = self.module.get_function('sum_along_axis')
        self.norm_along_axis = self.module.get_function('norm_along_axis')
        self.init_vect_sum = self.module.get_function('init_vect_sum')
        self.copy_mat = self.module.get_function('copy_mat')
        ################################################################################################################
        self.n_generators = n_generators
        seed = 1
        # One curandStateXORWOW per generator; gen_rand_mat maps each matrix
        # element to a state via coord % n_rng.
        self.rng_states = cuda.mem_alloc(n_generators*characterize.sizeof('curandStateXORWOW',
                                                                          '#include <curand_kernel.h>'))
        self.init_rng(np.int32(n_generators), self.rng_states, np.uint64(seed), np.uint64(0), block=(64, 1, 1),
                      grid=(n_generators//64 + 1,1))
        (free,total)=cuda.mem_get_info()

        # 32x32 thread blocks tiling the whole matrix (one extra partial tile).
        size_block_x = 32
        size_block_y = 32
        n_blocks_x = int(self.width_mat)//(size_block_x) + 1
        n_blocks_y = int(self.width_mat)//(size_block_y) + 1
        self.grid = (n_blocks_x, n_blocks_y, 1)
        self.block = (size_block_x, size_block_y, 1)

    def generate_new_matrix(self,n_iter, do_random, fact_sub_sampling):
        """Produce one resampled + renormalized matrix.

        :param n_iter: number of alternating row/column normalization passes
        :param do_random: if true, Poisson-resample each element; otherwise
                          just copy/scale deterministically
        :param fact_sub_sampling: multiplicative factor applied to each
                                  element before (re)sampling
        :return: self.cpu_new_data (reused buffer, overwritten on each call)
        """
        # fact_sub_sampling = np.float32(0.5)
        if do_random:
            self.gen_rand_mat(self.gpu_init_data, self.gpu_new_data, self.rng_states, np.int32(self.n_generators),
                              self.width_mat, np.float32(fact_sub_sampling),
                              block=self.block, grid=self.grid, shared=0)
        else:
            self.copy_mat(self.gpu_init_data, self.gpu_new_data, self.width_mat,np.float32(fact_sub_sampling),
                          block=self.block, grid=self.grid, shared=0)
        print "matrix generated"
        for i in xrange(0, n_iter):
            # print "iter,",i
            # Even iterations normalize axis 0, odd iterations axis 1.
            id_axis = np.int32(np.mod(i, 2))
            self.sum_along_axis(self.gpu_new_data, self.gpu_vect_sum, self.width_mat, id_axis,
                                block=self.block, grid=self.grid, shared=0)

            self.norm_along_axis(self.gpu_new_data, self.gpu_vect_sum, self.width_mat, id_axis,
                                 block=self.block, grid=self.grid, shared=0)

            # Zero the accumulator for the next pass.
            self.init_vect_sum(self.gpu_vect_sum, block=(64,1,1), grid=(int(self.width_mat)//64 +1, 1))

        # self.gpu_new_data.get(ary=self.cpu_new_data)
        cuda.memcpy_dtoh(self.cpu_new_data, self.gpu_new_data)
        return self.cpu_new_data

    def free_gpu(self,):
        """Release all device allocations and detach this instance's context.

        The instance is unusable afterwards.
        """
        self.rng_states.free()
        self.gpu_vect_sum.free()
        self.gpu_new_data.free()
        self.gpu_init_data.free()
        (free,total)=cuda.mem_get_info()
        print("Global memory occupancy after cleaning processes: %f%% free"%(free*100/total))
        print("Global free memory :%i Mo free"%(free/10**6))
        del self.module
        self.ctx.detach()
        # self.ctx.pop()
        # cuda.Context.pop()
if __name__ == '__main__':
341+
342+
import numpy as np
343+
a = np.ones((10000, 10000),dtype=np.int32)
344+
gen = randomize(a, 1000)
345+
out = gen.generate_new_matrix(10)
346+
gen.free_gpu()
347+
print out

distributed_scn.py

Whitespace-only changes.

0 commit comments

Comments
 (0)