# BGAgent.py
# Forked from OptMLGroup/DeepBeerInventory-RL.
from SRDQN import DQN
import numpy as np
# Here we want to define the agent class for the BeerGame
class Agent(object):
    """One player of the beer game.

    The agent keeps its inventory level (IL), open order (OO), the per-period
    arriving-shipment (AS) and arriving-order (AO) arrays, its reward
    bookkeeping and, when its policy type is 'srdqn', a DQN "brain".
    """

    def __init__(self, agentNum, IL, AO, AS, c_h, c_p, eta, compuType, config):
        """Store initial conditions, cost parameters and the policy type.

        Args:
            agentNum: index of this agent; used to index per-agent config lists.
            IL: initial inventory level; restored by resetPlayer.
            AO: initial arriving-order values; resetPlayer reads
                AO[agentNum - 1] for every agent except agent 0.
            AS: initial arriving-shipment value; resetPlayer writes it into
                the first slots of the AS array.
            c_h: holding cost per unit of positive inventory.
            c_p: backorder cost per unit of negative inventory.
            eta: total-cost regularizer (only stored by this class).
            compuType: policy type used for both training and testing:
                'rnd' (random), 'srdqn', 'Strm' (formula, Rong 2008) or
                'bs' (optimal policy if it exists).
            config: configuration object; kept as self.config.
        """
        self.agentNum = agentNum
        self.IL = IL  # inventory level - changes during the game
        self.OO = 0  # open order - changes during the game
        self.ASInitial = AS  # initial arriving shipment
        self.ILInitial = IL  # IL every game starts from
        self.AOInitial = AO  # initial arriving orders every game starts from
        self.config = config
        self.curState = []  # current state of the game
        self.nextState = []
        self.curReward = 0  # reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        self.totRew = 0  # reward of all players obtained for the current player
        self.c_h = c_h  # holding cost
        self.c_p = c_p  # backorder cost
        self.eta = eta  # total-cost regularizer
        self.AS = np.zeros((1, 1))  # arrived shipment; re-sized by resetPlayer
        self.AO = np.zeros((1, 1))  # arrived order; re-sized by resetPlayer
        self.action = 0  # the action at time t
        # rnd -> random / srdqn -> srdqn / Strm -> formula-Rong2008 /
        # bs -> optimal policy if exists
        self.compTypeTrain = compuType
        self.compTypeTest = compuType
        self.alpha_b = self.config.alpha_b[self.agentNum]  # parameter for the formula
        self.betta_b = self.config.betta_b[self.agentNum]  # parameter for the formula

        # a_b / b_b: further parameters for the formula. b_b is a demand
        # figure multiplied by the mean lead time of this agent.
        # For any other demandDistribution value a_b and b_b stay unset,
        # exactly as in the original code.
        distribution = self.config.demandDistribution
        if distribution == 0:
            meanDemand = np.mean((self.config.demandUp, self.config.demandLow))
            self.a_b = meanDemand
            self.b_b = meanDemand * self._meanLeadTime()
        elif distribution in (1, 3, 4):
            self.a_b = self.config.demandMu
            self.b_b = self.config.demandMu * self._meanLeadTime()
        elif distribution == 2:
            self.a_b = 8
            self.b_b = (3 / 4.) * 8 * self._meanLeadTime()
        # NOTE(review): the original code had one more branch,
        # `elif demandDistribution == 3: a_b = 10; b_b = 7 * <mean lead time>`,
        # which could never run because 3 is already handled above. It was
        # removed as dead code; confirm which parameters are intended for 3.

        self.hist = []  # used for plotting - keeps the history for only one game
        self.hist2 = []  # used for animation
        self.srdqnBaseStock = []  # base stock levels that srdqn has come up with
        self.T = 0
        self.bsBaseStock = 0
        self.init_bsBaseStock = 0
        self.nextObservation = []
        if self.compTypeTrain == 'srdqn':
            self.brain = DQN(self.agentNum, config)
            self.brain.setInitState(self.curState)  # sets the initial input of the network

    def _meanLeadTime(self):
        """Return mean(item lead-time bounds) + mean(order lead-time bounds) for this agent."""
        cfg = self.config
        k = self.agentNum
        itemLead = np.mean((cfg.leadRecItemLow[k], cfg.leadRecItemUp[k]))
        orderLead = np.mean((cfg.leadRecOrderLow[k], cfg.leadRecOrderUp[k]))
        return itemLead + orderLead

    def resetPlayer(self, T):
        """Reset the player information at the start of a game of length T.

        Restores IL, clears OO, rewards, histories and the action, re-creates
        the AS / AO arrays and stores the first observation in
        self.curObservation.
        """
        self.IL = self.ILInitial
        self.OO = 0
        # One slot per period plus the largest lead times and a margin of 10.
        horizon = T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10
        self.AS = np.zeros(horizon)  # arrived shipment
        self.AO = np.zeros(horizon)  # arrived order
        # Agent 0 gets no pre-filled arriving orders; every other agent starts
        # with AOInitial[agentNum - 1] in its first slots.
        if self.agentNum != 0:
            for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]):
                self.AO[i] = self.AOInitial[self.agentNum - 1]
        for i in range(self.config.leadRecItemUp[self.agentNum]):
            self.AS[i] = self.ASInitial
        self.curReward = 0  # reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        # NOTE(review): the action is reset to an empty list, and getCurState
        # below calls np.argmax(self.action) when config.ifUseActionInD is
        # set - verify that combination is never used, or pick a default.
        self.action = []
        self.hist = []
        self.hist2 = []
        self.srdqnBaseStock = []  # base stock levels that srdqn has come up with
        self.T = T
        self.curObservation = self.getCurState(1)  # current state of the game
        self.nextObservation = []
        if self.compTypeTrain == 'srdqn':
            self.brain.setInitState(self.curObservation)  # sets the initial input of the network

    def recieveItems(self, time):
        """Update IL and OO at period `time` with the shipment AS[time]."""
        arrived = self.AS[time]
        self.IL = self.IL + arrived  # inventory level update
        self.OO = self.OO - arrived  # inventory in transit update

    def actionValue(self, curTime, playType):
        """Return the order quantity associated with self.action.

        Args:
            curTime: current period; indexes self.AO for the "d + x" rule.
            playType: "test" or "train"; selects compTypeTest or compTypeTrain.

        Returns:
            With config.fixedAction: actionList[argmax(action)].
            Otherwise, clipped below at 0:
              'srdqn' -> actionList[...] * action_step + AO[curTime]
              'rnd'   -> actionList[...] + AO[curTime]
              other   -> actionListOpt[...]

        Raises:
            ValueError: if playType is neither "test" nor "train".
        """
        if playType == "test":
            compType = self.compTypeTest
        elif playType == "train":
            # The original code tested compTypeTest for the 'rnd' case here,
            # a copy-paste slip from the "test" branch.
            compType = self.compTypeTrain
        else:
            raise ValueError(
                "playType must be 'test' or 'train', got %r" % (playType,))

        chosen = np.argmax(self.action)
        if self.config.fixedAction:
            return self.config.actionList[chosen]
        # "d + x" rule: order the arrived demand plus the chosen adjustment.
        if compType == 'srdqn':
            return max(0, self.config.actionList[chosen] * self.config.action_step + self.AO[curTime])
        if compType == 'rnd':
            return max(0, self.config.actionList[chosen] + self.AO[curTime])
        return max(0, self.config.actionListOpt[chosen])

    def getReward(self):
        """Compute the reward of the current state and update cumReward."""
        # cost (holding + backorder) for one time unit, scaled by a fixed 200
        cost = (self.c_p * max(0, -self.IL) + self.c_h * max(0, self.IL)) / 200.
        self.curReward = -cost  # reward is negative because it is a cost
        # discounted running total of this agent's reward
        self.cumReward = self.config.gamma * self.cumReward + self.curReward

    def getCurState(self, t):
        """Return a np.array describing the current state of the agent.

        The first three entries are the backorder amount (-IL when IL < 0,
        else 0), the on-hand inventory (IL when IL > 0, else 0) and OO.
        With config.ifUseASAO an arriving-shipment entry (AS[t] when
        config.if_use_AS_t_plus_1, else AS[t-1]) and AO[t] follow.
        With config.ifUseActionInD the value actionList[argmax(action)] is
        appended at the end.
        """
        backorder = -1 * (self.IL < 0) * self.IL
        onHand = 1 * (self.IL > 0) * self.IL
        if self.config.ifUseASAO:
            shipment = self.AS[t] if self.config.if_use_AS_t_plus_1 else self.AS[t - 1]
            curState = np.array([backorder, onHand, self.OO, shipment, self.AO[t]])
        else:
            curState = np.array([backorder, onHand, self.OO])
        if self.config.ifUseActionInD:
            a = self.config.actionList[np.argmax(self.action)]
            curState = np.concatenate((curState, np.array([a])))
        return curState