-
Notifications
You must be signed in to change notification settings - Fork 5
/
Q_learning_handover.m
161 lines (146 loc) · 4.46 KB
/
Q_learning_handover.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
clear all;close all;clc;
%% Create the matrix of TTT combinations
% 9 candidate TTT values for LTE-to-VLC (TTT_LV) and VLC-to-LTE (TTT_VL)
TTT_LV = [0.0 0.04 0.08 0.16 0.32 0.64 1.28 2.56 5.12];
TTT_VL = TTT_LV;
% N: total number of (TTT_LV, TTT_VL) combinations, i.e. the number of states
N = length(TTT_LV)*length(TTT_VL);
% n: number of TTT_VL levels = index stride between consecutive TTT_LV levels.
% With equally long lists (as here) this equals sqrt(N) = 9.
n = length(TTT_VL);
% Enumerate every combination as one row of TTT_comb (N-by-2):
%   state i = (q-1)*n + r   <->   [TTT_LV(q) TTT_VL(r)],   q, r = 1..n
% Viewed as an n-by-n grid stored in column-major order, each column
% corresponds to one value of TTT_LV and each row to one value of TTT_VL.
TTT_comb = zeros(N,2);
for i = 1:N
    % Plain integer arithmetic; no Symbolic Math Toolbox (quorem/sym) needed
    q = floor((i-1)/n) + 1;   % 1-based TTT_LV level
    r = mod(i-1,n) + 1;       % 1-based TTT_VL level
    TTT_comb(i,:) = [TTT_LV(q) TTT_VL(r)];
end
%% Create reward matrix for limiting the possible actions at each state (TTT combination)
% State i is row i of TTT_comb, with i = (a-1)*n + b, where a is the level
% (index) of TTT_LV and b is the level (index) of TTT_VL. From state i there
% are 9 possible actions, each identified by the state it leads to:
%
% Increase TTT_VL by a single level: (i+1)
% Decrease TTT_VL by a single level: (i-1)
% Decrease TTT_LV by a single level: (i-n)
% Increase TTT_LV by a single level: (i+n)
% Increase both TTT_LV and TTT_VL by a single level: (i+n+1)
% Decrease both TTT_LV and TTT_VL by a single level: (i-n-1)
% Decrease TTT_LV and increase TTT_VL by a single level: (i-n+1)
% Increase TTT_LV and decrease TTT_VL by a single level: (i+n-1)
% No change to the current values of TTT_LV and TTT_VL: i
% Each run uses t equal-length time periods, with t taken from [12 24 48 96]
% count: index of the current period count; selects the column of
% Throughput_Average filled by the current run.
count=1;
period_counts = [12 24 48 96];
% Q-learning hyper-parameters (identical for every period count)
gamma = 0.9;            % discount factor
alpha = 1;              % learning rate (1 => Q-value is replaced by the target)
maxItr = 10000;         % number of episodes per period count
epsilon_initial = 0.2;  % initial exploration probability
% Start state: highest TTT_LV level combined with the lowest TTT_VL level.
% Equals 73 for the 9x9 grid used here.
start_state = N - n + 1;
% One row per episode, one column per period count
Throughput_Average = zeros(maxItr,length(period_counts));
for t = period_counts
    %% Mask of admissible transitions
    % reward(i,j,k) belongs to the move from state i to state j in period k.
    % Admissible moves are initialised to 1, all others are set to -Inf.
    reward = ones(N,N,t);
    allowed_steps = [0 1 -1 n -n n+1 -n-1 -n+1 n-1];
    for i = 1:N
        for j = 1:N
            % The test does not depend on k, so all periods are set at once
            if ~any((j-i) == allowed_steps)
                reward(i,j,:) = -Inf;
            end
        end
    end
    % Remove wrap-around moves: states i = 1, n+1, 2n+1, ... have the lowest
    % TTT_VL level, so index offsets -1 land on the highest TTT_VL level of a
    % neighbouring column rather than on an adjacent level. Both directions
    % of each such pair are forbidden.
    for i = 1:n:N
        for j = [i+n-1, i-1, i-n-1]
            if j >= 1 && j <= N
                reward(i,j,:) = -Inf;
                reward(j,i,:) = -Inf;
            end
        end
    end
    %% Fill in the reward of every admissible transition
    % Func_of_Cal_reward_two_AP is defined elsewhere; it is called with the
    % TTT pair of the destination state j, the period index k and the number
    % of periods t. Presumably it returns a throughput in Mbps (see the plot
    % label) - confirm. Call order (i, j, k) is kept as in the original.
    for i = 1:N
        for j = 1:N
            for k = 1:t
                if reward(i,j,k) > 0
                    reward(i,j,k) = Func_of_Cal_reward_two_AP(TTT_comb(j,1),TTT_comb(j,2),k,t);
                end
            end
        end
    end
    % Snapshot of the whole workspace (taken before learning starts)
    filename = sprintf('t%dsmall.mat',t);
    save(filename);
    disp(t)
    %% Q-learning algorithm
    % Initialize the Q-table with random values ~ N(0,1)
    q = randn(size(reward));
    % Reset per period count so no entries from a previous run can leak
    % into the episode mean
    Throughput = zeros(1,t);
    % cs -> current state
    % ns -> next state
    %
    % Run maxItr episodes of t periods each
    for i = 1:maxItr
        % Exploration probability decays with the episode index
        epsilon = epsilon_initial/(1+i/500);
        % Every episode starts from the same state
        cs = start_state;
        for k = 1:t
            % Admissible actions of the current state. NOTE(review): relies
            % on admissible rewards being >= 0 - confirm for the reward
            % function.
            n_actions = find(reward(cs,:,k)>=0);
            if rand(1) < epsilon
                % Explore: uniformly random admissible action
                ns = n_actions(randi(length(n_actions)));
            else
                % Exploit: admissible action with the largest Q-value,
                % ties broken uniformly at random
                q_cs = q(cs,n_actions,k);
                ns = n_actions(q_cs == max(q_cs));
                if length(ns) > 1
                    ns = ns(randi(length(ns)));
                end
            end
            if k < t
                % Best Q-value obtainable from the next state in the next
                % period. The true maximum is used; the running maximum is
                % not seeded with 0, so negative Q-values are not clipped.
                next_actions = find(reward(ns,:,k+1)>=0);
                max_q = max(q(ns,next_actions,k+1));
                if isempty(max_q)
                    max_q = 0;   % no admissible successor action
                end
                target = reward(cs,ns,k) + gamma*max_q;
            else
                % Last period: nothing to bootstrap from
                target = reward(cs,ns,k);
            end
            % Q-learning update; with alpha = 1 this stores the target itself
            q(cs,ns,k) = (1-alpha)*q(cs,ns,k) + alpha*target;
            % Set current state as next state
            cs = ns;
            Throughput(k) = Func_of_Cal_reward_two_AP(TTT_comb(cs,1),TTT_comb(cs,2),k,t);
        end
        Throughput_Average(i,count) = mean(Throughput);
    end
    count = count+1;
end
% Plot the per-episode average throughput, one curve per period count
% (columns 1..4 of Throughput_Average, in the order t = 12, 24, 48, 96).
curve_colors = {'r','b','g','k'};
figure
hold on
for c = 1:4
    plot(1:maxItr,Throughput_Average(:,c),curve_colors{c});
end
hold off
legend('t = 12','t = 24','t = 48','t = 96');
xlabel('Episode index');
ylabel('Average throughput (Mbps)');
grid on
box on
% cs =241;
% for k=1:t
% ns = find(q(cs,:,k)==max(q(cs,:,k)));
% if length(ns)>1
% ns = ns(randi(length(ns)));
% end
% Can_TTT_LV(k) = TTT_comb(ns,1);
% Can_TTT_VL(k) = TTT_comb(ns,2);
% cs = ns;
% end
% figure
% hold on
% plot(1:t,Can_TTT_LV,'r');
% plot(1:t,Can_TTT_VL,'b');
% hold off