app.cu · 105 lines (78 loc) · 2.76 KB
#include <iostream>
#include "cudalist.cuh"
#define GRIDDIM 256  // blocks per launch
#define BLOCKDIM 64  // threads per block; also used as the per-thread list capacity
// List storage in a per-thread local array: each thread owns its own buffer.
__global__
void internal_memory(float *result) {
    int thid = blockDim.x * blockIdx.x + threadIdx.x;
    float memory[BLOCKDIM];
    culist<float, int> list(memory, blockDim.x);
    // Repeatedly fill the list to capacity from both ends, then drain it again.
    for (int i = 0; i < 100000; ++i) {
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.push_front(threadIdx.x + m);
            list.push_back(threadIdx.x + m);
        }
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.pop_front();
            list.pop_back();
        }
    }
    // Fill once more and write the front element to global memory.
    for (int m = 0; m < blockDim.x / 2; ++m) {
        list.push_front(threadIdx.x + m);
        list.push_back(threadIdx.x + m);
    }
    result[thid] = list[0];
}
// Same workload, but the list storage lives in a global-memory buffer supplied by
// the host; each thread gets its own blockDim.x-sized slice of that buffer.
__global__
void external_memory(float *result, float *memory) {
    int thid = blockDim.x * blockIdx.x + threadIdx.x;
    culist<float, int> list(memory + thid * blockDim.x, blockDim.x);
    for (int i = 0; i < 100000; ++i) {
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.push_front(threadIdx.x + m);
            list.push_back(threadIdx.x + m);
        }
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.pop_front();
            list.pop_back();
        }
    }
    for (int m = 0; m < blockDim.x / 2; ++m) {
        list.push_front(threadIdx.x + m);
        list.push_back(threadIdx.x + m);
    }
    result[thid] = list[0];
}
// Same workload again, with the per-thread slices carved out of one shared-memory
// block (BLOCKDIM * BLOCKDIM floats = 16 KB per block at BLOCKDIM == 64).
__global__
void shared_memory(float *result) {
    int thid = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float memory[BLOCKDIM * BLOCKDIM];
    culist<float, int> list(memory + threadIdx.x * blockDim.x, blockDim.x);
    for (int i = 0; i < 100000; ++i) {
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.push_front(threadIdx.x + m);
            list.push_back(threadIdx.x + m);
        }
        for (int m = 0; m < blockDim.x / 2; ++m) {
            list.pop_front();
            list.pop_back();
        }
    }
    for (int m = 0; m < blockDim.x / 2; ++m) {
        list.push_front(threadIdx.x + m);
        list.push_back(threadIdx.x + m);
    }
    result[thid] = list[0];
}
int main() {
    float *Memory = NULL, *Result = NULL, *result = new float[GRIDDIM * BLOCKDIM];
    // Memory is the backing store for external_memory: one BLOCKDIM-float slice per thread.
    cudaMalloc(&Memory, sizeof(float) * GRIDDIM * BLOCKDIM * BLOCKDIM);
    cudaMalloc(&Result, sizeof(float) * GRIDDIM * BLOCKDIM);
    internal_memory<<<GRIDDIM, BLOCKDIM>>>(Result); // fastest
    //external_memory<<<GRIDDIM, BLOCKDIM>>>(Result, Memory); // meh
    //shared_memory<<<GRIDDIM, BLOCKDIM>>>(Result); // meh^2
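
    // Timing sketch (not part of the original app.cu): one way to quantify the
    // "fastest"/"meh"/"meh^2" comparison above is to wrap a launch in CUDA events.
    // Here internal_memory is simply launched a second time for measurement.
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0);
    internal_memory<<<GRIDDIM, BLOCKDIM>>>(Result);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1);
    std::cout << "internal_memory: " << ms << " ms" << std::endl;
    cudaEventDestroy(t0);
    cudaEventDestroy(t1);
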
    cudaMemcpy(result, Result, sizeof(float) * GRIDDIM * BLOCKDIM,
               cudaMemcpyDeviceToHost);
    for (int m = 0; m < GRIDDIM * BLOCKDIM; ++m)
        std::cout << result[m] << std::endl;
    cudaFree(Memory);
    cudaFree(Result);
    delete[] result;
}