This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Nonstationary bandit #6

Open · wants to merge 6 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ your weights against the true values for plotting results of various algorithms
* `BanditTenArmedRandomRandom-v0`: 10 armed bandit with random probabilities assigned to both payouts and rewards
* `BanditTenArmedUniformDistributedReward-v0`: 10 armed bandit that always pays out with a reward selected from a uniform distribution
* `BanditTenArmedGaussian-v0`: 10 armed bandit mentioned on page 30 of [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0) (Sutton and Barto)

* `BanditTenArmedNonstationaryConstantGaussian-v0`: 10 armed non-stationary bandit mentioned on page 33 of [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0) (Sutton and Barto)
### Installation
```bash
git clone git@github.com:JKCooper2/gym-bandits.git
```
4 changes: 3 additions & 1 deletion gym_bandits/__init__.py
@@ -8,6 +8,7 @@
from .bandit import BanditTwoArmedHighHighFixed
from .bandit import BanditTwoArmedHighLowFixed
from .bandit import BanditTwoArmedLowLowFixed
from .bandit import BanditTenArmedNonstationaryConstantGaussian

environments = [['BanditTenArmedRandomFixed', 'v0'],
                ['BanditTenArmedRandomRandom', 'v0'],
@@ -16,7 +17,8 @@
                ['BanditTwoArmedDeterministicFixed', 'v0'],
                ['BanditTwoArmedHighHighFixed', 'v0'],
                ['BanditTwoArmedHighLowFixed', 'v0'],
                ['BanditTwoArmedLowLowFixed', 'v0'],
                ['BanditTenArmedNonstationaryConstantGaussian', 'v0']]

for environment in environments:
    register(
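The `register(` call is cut off by the diff view. For context, a minimal sketch of how the loop presumably completes, assuming gym's standard `register()` signature and that each class is exported from the `gym_bandits` package; any kwargs beyond `id` and `entry_point` are this sketch's assumption, not the PR's code:

```python
# Hypothetical completion of the truncated loop above, for context only.
from gym.envs.registration import register

for environment in environments:
    register(
        id='{}-{}'.format(environment[0], environment[1]),    # e.g. 'BanditTenArmedNonstationaryConstantGaussian-v0'
        entry_point='gym_bandits:{}'.format(environment[0]),  # class imported in __init__.py above
        nondeterministic=True,  # rewards are sampled, so runs are not reproducible by seed alone
    )
```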
37 changes: 36 additions & 1 deletion gym_bandits/bandit.py
@@ -124,4 +124,39 @@ def __init__(self, bandits=10):
        for _ in range(bandits):
            r_dist.append([np.random.normal(0, 1), 1])

        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)


class BanditTenArmedNonstationaryConstantGaussian(BanditEnv):
    """
    10 armed nonstationary bandit mentioned on page 33 of Sutton and Barto's
    [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)

    After every step each q*(a) moves independently by +/- step_size (a random walk)
    Actions always pay out
    Mean of payout is drawn from a normal distribution (0, 1) (called q*(a))
    Actual reward is drawn from a normal distribution (q*(a), 1)
    """
    def __init__(self, step_size=0.01, bandits=10):
        p_dist = np.full(bandits, 1)
        r_dist = []

        for _ in range(bandits):
            r_dist.append([np.random.normal(0, 1), 1])

        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)

        def updateDistributions(stepFunc):
            # Wrap the inherited step(): after the reward for this pull is
            # computed, every arm's mean q*(a) takes an independent
            # +/- step_size random-walk step.
            def wrapper(action, *args, **kwargs):
                results = stepFunc(action, *args, **kwargs)
                for arm in self.r_dist:
                    if np.random.random_sample() < 0.5:
                        arm[0] += step_size
                    else:
                        arm[0] -= step_size

                return results

            return wrapper

        self.step = updateDistributions(self.step)
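Once registered, the new environment can be exercised end to end. A quick smoke test, assuming the package is installed, that `BanditEnv` follows the classic gym `reset()`/`step()` 4-tuple API, and that episodes terminate after each pull (as is typical for bandit environments):

```python
import gym
import gym_bandits  # importing the package runs the register() loop above

env = gym.make('BanditTenArmedNonstationaryConstantGaussian-v0')
env.reset()

# Pull arm 0 repeatedly. Because every q*(a) random-walks by +/- step_size
# after each step, the rewards from a fixed arm drift over time.
for t in range(10):
    observation, reward, done, info = env.step(0)
    print(t, reward)
    if done:  # assumed: bandit episodes are one step long
        env.reset()
```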
22 changes: 22 additions & 0 deletions gym_bandits/scoreboard.py
@@ -96,6 +96,28 @@
background="Described on page 30 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)"
)

add_task(
    id='BanditTenArmedNonstationaryConstantGaussian-v0',
    group='bandits',
    experimental=True,
    contributor='MouseAndKeyboard',
    summary="10 armed bandit with Gaussian rewards whose per-arm reward distribution drifts by a constant step over time",
    description="""
Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
and a reward distribution, which is the value or distribution of what the agent will be rewarded
if the bandit does pay out.

p_dist = [1] (* 10)
r_dist = [numpy.random.normal(0, 1), 1] (* 10)

After every step the reward mean of each arm moves independently by +/- step_size
Every bandit always pays out
Each action has a reward mean (selected from a normal distribution with mean 0 and std 1), and the actual
reward is drawn from a normal distribution with std 1 around that mean
""",
    background="Described on page 33 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)"
)
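The drifting means are exactly why page 33 of Sutton and Barto favors a constant step-size update over sample averages: recent rewards must outweigh stale ones. A minimal self-contained sketch of that agent against a stand-in for this environment's dynamics (the in-line random walk mirrors the env's `step_size` logic; none of this is the PR's code):

```python
import numpy as np

rng = np.random.default_rng(0)
n_arms, alpha, eps, step_size = 10, 0.1, 0.1, 0.01
q_star = rng.normal(0, 1, n_arms)  # hidden true means, as in r_dist above
Q = np.zeros(n_arms)               # agent's action-value estimates

for t in range(10000):
    # epsilon-greedy action selection
    a = rng.integers(n_arms) if rng.random() < eps else int(np.argmax(Q))
    r = rng.normal(q_star[a], 1)   # reward ~ N(q*(a), 1)
    # constant step-size keeps weight on recent rewards, so Q tracks the drift;
    # a 1/n sample average would weight old, stale rewards equally
    Q[a] += alpha * (r - Q[a])
    # the environment's nonstationarity: each mean moves by +/- step_size
    q_star += np.where(rng.random(n_arms) < 0.5, step_size, -step_size)
```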

add_task(
    id='BanditTenArmedRandomRandom-v0',
    group='bandits',