This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Nonstationary bandit #6

Open · wants to merge 6 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ your weights against the true values for plotting results of various algorithms
* `BanditTenArmedRandomRandom-v0`: 10 armed bandit with random probabilities assigned to both payouts and rewards
* `BanditTenArmedUniformDistributedReward-v0`: 10 armed bandit that always pays out with a reward selected from a uniform distribution
* `BanditTenArmedGaussian-v0`: 10 armed bandit mentioned on page 30 of [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0) (Sutton and Barto)

* `BanditTenArmedNonstationaryConstantGaussian-v0`: 10 armed non-stationary bandit mentioned on page 33 of [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0) (Sutton and Barto)
### Installation
```bash
git clone git@github.com:JKCooper2/gym-bandits.git
```
4 changes: 3 additions & 1 deletion gym_bandits/__init__.py
@@ -8,6 +8,7 @@
from .bandit import BanditTwoArmedHighHighFixed
from .bandit import BanditTwoArmedHighLowFixed
from .bandit import BanditTwoArmedLowLowFixed
from .bandit import BanditTenArmedNonstationaryConstantGaussian

environments = [['BanditTenArmedRandomFixed', 'v0'],
                ['BanditTenArmedRandomRandom', 'v0'],
@@ -16,7 +17,8 @@
                ['BanditTwoArmedDeterministicFixed', 'v0'],
                ['BanditTwoArmedHighHighFixed', 'v0'],
                ['BanditTwoArmedHighLowFixed', 'v0'],
                ['BanditTwoArmedLowLowFixed', 'v0'],
                ['BanditTenArmedNonstationaryConstantGaussian', 'v0']]

for environment in environments:
    register(
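The `register(` call is cut off by the diff view. For context, a minimal sketch of how the loop presumably completes, assuming gym's standard `register()` signature and that each class is exported from the `gym_bandits` package; any kwargs beyond `id` and `entry_point` are this sketch's assumption, not the PR's code:

```python
# Hypothetical completion of the truncated loop above, for context only.
from gym.envs.registration import register

for environment in environments:
    register(
        id='{}-{}'.format(environment[0], environment[1]),    # e.g. 'BanditTenArmedNonstationaryConstantGaussian-v0'
        entry_point='gym_bandits:{}'.format(environment[0]),  # class imported in __init__.py above
        nondeterministic=True,  # rewards are sampled, so runs are not reproducible by seed alone
    )
```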
37 changes: 36 additions & 1 deletion gym_bandits/bandit.py
@@ -124,4 +124,39 @@ def __init__(self, bandits=10):
        for _ in range(bandits):
            r_dist.append([np.random.normal(0, 1), 1])

        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)


class BanditTenArmedNonstationaryConstantGaussian(BanditEnv):
    """
    10 armed nonstationary bandit mentioned on page 33 of Sutton and Barto's
    [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)

    After every step each q*(a) moves independently by +/- step_size (a random walk)
    Actions always pay out
    Mean of payout is drawn from a normal distribution (0, 1) (called q*(a))
    Actual reward is drawn from a normal distribution (q*(a), 1)
    """
    def __init__(self, step_size=0.01, bandits=10):
        p_dist = np.full(bandits, 1)
        r_dist = []

        for _ in range(bandits):
            r_dist.append([np.random.normal(0, 1), 1])

        BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)

        def updateDistributions(stepFunc):
            # Wrap the inherited step(): after the reward for this pull is
            # computed, every arm's mean q*(a) takes an independent
            # +/- step_size random-walk step.
            def wrapper(action, *args, **kwargs):
                results = stepFunc(action, *args, **kwargs)
                for arm in self.r_dist:
                    if np.random.random_sample() < 0.5:
                        arm[0] += step_size
                    else:
                        arm[0] -= step_size

                return results

            return wrapper

        self.step = updateDistributions(self.step)
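Once registered, the new environment can be exercised end to end. A quick smoke test, assuming the package is installed, that `BanditEnv` follows the classic gym `reset()`/`step()` 4-tuple API, and that episodes terminate after each pull (as is typical for bandit environments):

```python
import gym
import gym_bandits  # importing the package runs the register() loop above

env = gym.make('BanditTenArmedNonstationaryConstantGaussian-v0')
env.reset()

# Pull arm 0 repeatedly. Because every q*(a) random-walks by +/- step_size
# after each step, the rewards from a fixed arm drift over time.
for t in range(10):
    observation, reward, done, info = env.step(0)
    print(t, reward)
    if done:  # assumed: bandit episodes are one step long
        env.reset()
```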
22 changes: 22 additions & 0 deletions gym_bandits/scoreboard.py
@@ -96,6 +96,28 @@
background="Described on page 30 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)"
)

add_task(
    id='BanditTenArmedNonstationaryConstantGaussian-v0',
    group='bandits',
    experimental=True,
    contributor='MouseAndKeyboard',
    summary="10 armed bandit with Gaussian rewards whose per-arm reward distribution drifts by a constant step over time",
    description="""
Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
and a reward distribution, which is the value or distribution of what the agent will be rewarded
if the bandit does pay out.

p_dist = [1] (* 10)
r_dist = [numpy.random.normal(0, 1), 1] (* 10)

After every step the reward mean of each arm moves independently by +/- step_size
Every bandit always pays out
Each action has a reward mean (selected from a normal distribution with mean 0 and std 1), and the actual
reward is drawn from a normal distribution with std 1 around that mean
""",
    background="Described on page 33 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)"
)
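The drifting means are exactly why page 33 of Sutton and Barto favors a constant step-size update over sample averages: recent rewards must outweigh stale ones. A minimal self-contained sketch of that agent against a stand-in for this environment's dynamics (the in-line random walk mirrors the env's `step_size` logic; none of this is the PR's code):

```python
import numpy as np

rng = np.random.default_rng(0)
n_arms, alpha, eps, step_size = 10, 0.1, 0.1, 0.01
q_star = rng.normal(0, 1, n_arms)  # hidden true means, as in r_dist above
Q = np.zeros(n_arms)               # agent's action-value estimates

for t in range(10000):
    # epsilon-greedy action selection
    a = rng.integers(n_arms) if rng.random() < eps else int(np.argmax(Q))
    r = rng.normal(q_star[a], 1)   # reward ~ N(q*(a), 1)
    # constant step-size keeps weight on recent rewards, so Q tracks the drift;
    # a 1/n sample average would weight old, stale rewards equally
    Q[a] += alpha * (r - Q[a])
    # the environment's nonstationarity: each mean moves by +/- step_size
    q_star += np.where(rng.random(n_arms) < 0.5, step_size, -step_size)
```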

add_task(
    id='BanditTenArmedRandomRandom-v0',
    group='bandits',