ref_parallel-dot-product-atomics.cu
// Reference implementation of a dot product: per-block shared-memory reduction plus a global atomicAdd
//#include <cuda_runtime.h> // automatically added by nvcc
#include <vector>
#include <iostream>
typedef float real_t;
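// Optional helper (not part of the original file): a minimal error-check macro
// in the usual wrap-every-CUDA-call idiom. The name CUDA_CHECK is a local
// choice, not a CUDA API; a sketch assuming only the standard runtime API.
#include <cstdio>
#include <cstdlib>
#define CUDA_CHECK( call )                                                   \
    do {                                                                     \
        cudaError_t err_ = ( call );                                         \
        if( err_ != cudaSuccess ) {                                          \
            std::fprintf( stderr, "CUDA error '%s' at %s:%d\n",              \
                          cudaGetErrorString( err_ ), __FILE__, __LINE__ );  \
            std::exit( EXIT_FAILURE );                                       \
        }                                                                    \
    } while( 0 )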
const size_t BLOCK_SIZE = 16; // threads per block; must be a power of two for the tree reduction
__global__ void full_dot( const real_t* v1, const real_t* v2, real_t* out, int N ) {
    __shared__ real_t cache[ BLOCK_SIZE ];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    cache[ threadIdx.x ] = 0.f;
    while( i < N ) { // grid-stride loop: each thread accumulates its share
        cache[ threadIdx.x ] += v1[ i ] * v2[ i ];
        i += gridDim.x * blockDim.x;
    }
    __syncthreads(); // every thread's partial sum must be visible before reducing
    // tree reduction in shared memory: strides 8, 4, 2, 1 for BLOCK_SIZE = 16
    i = BLOCK_SIZE / 2;
    while( i > 0 ) {
        if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ];
        __syncthreads();
        i /= 2; // halve the stride; a bitwise shift is unlikely to be faster
    }
    // cache[ 0 ] now holds this block's partial dot product
#ifndef NO_SYNC // atomicAdd serializes access to the single global accumulator
    if( threadIdx.x == 0 ) atomicAdd( out, cache[ 0 ] );
#else // without the atomic there is a data race; most likely:
      // 1) thread 0 of every block reads 0, then
      // 2) thread 0 of every block writes its local block sum (16 here),
      // so only one block's contribution survives
    if( threadIdx.x == 0 ) *out += cache[ 0 ];
#endif
}
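// Alternative sketch (not in the original): instead of a global atomicAdd,
// each block can write its partial sum to its own output slot and the host
// (or a second kernel) sums the BLOCKS values. The kernel name partial_dot
// and the per-block output layout are illustrative assumptions.
__global__ void partial_dot( const real_t* v1, const real_t* v2, real_t* out, int N ) {
    __shared__ real_t cache[ BLOCK_SIZE ];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    cache[ threadIdx.x ] = 0.f;
    while( i < N ) { // same grid-stride accumulation as full_dot
        cache[ threadIdx.x ] += v1[ i ] * v2[ i ];
        i += gridDim.x * blockDim.x;
    }
    __syncthreads();
    for( i = BLOCK_SIZE / 2; i > 0; i /= 2 ) { // same tree reduction
        if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ];
        __syncthreads();
    }
    // no atomic needed: each block owns out[ blockIdx.x ]
    if( threadIdx.x == 0 ) out[ blockIdx.x ] = cache[ 0 ];
}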
real_t dot( const real_t* v1, const real_t* v2, int N ) {
    real_t s = 0;
    for( int i = 0; i != N; ++i ) {
        s += v1[ i ] * v2[ i ];
    }
    return s;
}
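// Note (added): the GPU accumulates in a different order than this sequential
// loop, so for large N the two results can differ slightly in floating point.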
__global__ void init_vector( real_t* v, int N ) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while( i < N ) {
        v[ i ] = 1.0f; //real_t( i ) / 1000000.f;
        i += gridDim.x * blockDim.x;
    }
}
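// Because of the grid-stride loop, any launch configuration covers all N
// elements, which is why main launches init_vector with <<< 1024, 256 >>>.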
//------------------------------------------------------------------------------
int main( int argc, char** argv ) {
    const size_t ARRAY_SIZE = 1024; // use 1024 * 1024 for 1Mi elements
    const int BLOCKS = 64; // or 512 for the larger array
    const int THREADS_PER_BLOCK = BLOCK_SIZE; // total threads = 64 x 16 = 1024, one per element here
    const size_t SIZE = ARRAY_SIZE * sizeof( real_t );
    // device storage
    real_t* dev_v1 = 0; // vector 1
    real_t* dev_v2 = 0; // vector 2
    real_t* dev_out = 0; // result
    cudaMalloc( &dev_v1, SIZE );
    cudaMalloc( &dev_v2, SIZE );
    cudaMalloc( &dev_out, sizeof( real_t ) );
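    // with the optional CUDA_CHECK helper above, each call could be verified,
    // e.g. CUDA_CHECK( cudaMalloc( &dev_v1, SIZE ) );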
    // host storage
    std::vector< real_t > host_v1( ARRAY_SIZE );
    std::vector< real_t > host_v2( ARRAY_SIZE );
    real_t host_out = 0.f;
    // initialize vector 1 with a kernel, faster than a CPU loop for large
    // arrays; copy it back so the CPU reference uses the same data
    init_vector<<< 1024, 256 >>>( dev_v1, ARRAY_SIZE );
    cudaMemcpy( &host_v1[ 0 ], dev_v1, SIZE, cudaMemcpyDeviceToHost );
    // same for vector 2
    init_vector<<< 1024, 256 >>>( dev_v2, ARRAY_SIZE );
    cudaMemcpy( &host_v2[ 0 ], dev_v2, SIZE, cudaMemcpyDeviceToHost );
    // zero the result on the GPU: cudaMemset avoids running a kernel or
    // copying from the CPU
    cudaMemset( dev_out, 0, sizeof( real_t ) );
    // execute kernel
    full_dot<<< BLOCKS, THREADS_PER_BLOCK >>>( dev_v1, dev_v2, dev_out, ARRAY_SIZE );
    std::cout << cudaGetErrorString( cudaGetLastError() ) << std::endl;
    // copy the result from device (GPU) to host (CPU)
    cudaMemcpy( &host_out, dev_out, sizeof( real_t ), cudaMemcpyDeviceToHost );
    // print the dot product accumulated on the GPU
    std::cout << "GPU: " << host_out << std::endl;
    // print the dot product computed on the CPU for verification
    std::cout << "CPU: " << dot( &host_v1[ 0 ], &host_v2[ 0 ], ARRAY_SIZE ) << std::endl;
    // free device memory
    cudaFree( dev_v1 );
    cudaFree( dev_v2 );
    cudaFree( dev_out );
    return 0;
}
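// Build note (added): atomicAdd on float requires compute capability 2.0 or
// later, e.g. nvcc -arch=sm_20 ref_parallel-dot-product-atomics.cu -o dot.
// Compile with -DNO_SYNC to observe the data race in the unsynchronized path.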