"""k-armed bandit experiments (a one-state MDP) built on ``gym_bandits``.

This module is the code of the notebook ``Bandits/k-bandit.ipynb`` (first
added as ``MDP/k-bandit.ipynb``) converted to a plain Python script.

Types of gym_bandits
--------------------
``BanditTwoArmedDeterministicFixed-v0``
    Simplest case where one bandit always pays, and the other always doesn't.
``BanditTwoArmedHighLowFixed-v0``
    Stochastic version with a large difference between which bandit pays out
    of two choices.
``BanditTwoArmedHighHighFixed-v0``
    Stochastic version with a small difference between which bandit pays,
    where both are good.
``BanditTwoArmedLowLowFixed-v0``
    Stochastic version with a small difference between which bandit pays,
    where both are bad.
``BanditTenArmedRandomFixed-v0``
    10-armed bandit with random probabilities assigned to payouts.
``BanditTenArmedRandomRandom-v0``
    10-armed bandit with random probabilities assigned to both payouts and
    rewards.
``BanditTenArmedUniformDistributedReward-v0``
    10-armed bandit that always pays out, with a reward selected from a
    uniform distribution.
``BanditTenArmedGaussian-v0``
    10-armed bandit mentioned on page 30 of Reinforcement Learning: An
    Introduction (Sutton and Barto).

References
----------
- gym_bandits library from https://github.com/JKCooper2/gym-bandits
- moving average function
  https://github.com/klangner/rl-examples/blob/master/notebooks/openai-gym/%5Bnot%20solved%5D%20CartPole.ipynb
"""

import numpy as np

# NOTE: gym, gym_bandits and matplotlib are imported inside the functions that
# need them, so the helpers and agents below can be imported (and tested) with
# only NumPy installed.


# Helper functions

def moving_average(xs, n=100):
    """Return the simple moving average of *xs* over a window of *n* samples.

    Args:
        xs: 1-D sequence of numbers.
        n: window length; must be at least 1.

    Returns:
        A float array with ``len(xs) - n + 1`` entries (empty when *xs* is
        shorter than the window).

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("window size n must be at least 1")
    sums = np.cumsum(xs, dtype=float)
    # Turn the running total into "sum of the last n samples" in place.
    sums[n:] = sums[n:] - sums[:-n]
    return sums[n - 1:] / n


def divide_by_index(x):
    """Divide each element of *x* by its 1-based position.

    Applied to a cumulative sum this yields the running mean.
    """
    values = np.asarray(x, dtype=float)
    return values / np.arange(1, len(values) + 1)


def avg_across_arrays(xs):
    """Return the element-wise mean of several equal-length sequences.

    Args:
        xs: non-empty sequence of sequences, all of the same length.

    Raises:
        ValueError: if *xs* is empty.
    """
    if len(xs) == 0:
        raise ValueError("avg_across_arrays needs at least one array")
    return np.mean(np.asarray(xs, dtype=float), axis=0)


def softmax(x):
    """Return the softmax of the 1-D array-like *x*.

    The maximum is subtracted before exponentiating to mitigate overflow;
    this does not change the result.
    """
    x = np.asarray(x, dtype=float)
    exps = np.exp(x - x.max())
    return exps / exps.sum()


# Defining the agents

class SimpleAgent(object):
    """Epsilon-greedy agent that keeps one action-value estimate per arm.

    Attributes:
        num_actions: number of arms.
        A: ``range(num_actions)``, the available actions.
        N: float array, how many times each action has been learned from.
        Q: float array, current value estimate of each action.
    """

    def __init__(self, num_actions, Q_init=0):
        self.num_actions = num_actions
        self.A = range(num_actions)
        self.N = np.zeros(num_actions)
        # dtype=float is required: with the default integer Q_init, np.full
        # would build an integer array and every update would be truncated.
        self.Q = np.full(num_actions, Q_init, dtype=float)

    def choose_action(self, epsilon):
        """Pick a uniformly random action with probability *epsilon*,
        otherwise the action with the highest estimate (first one on ties)."""
        if np.random.rand() < epsilon:
            return np.random.choice(self.A)
        return np.argmax(self.Q)

    def learn(self, action, reward):
        """Update N and Q for *action* with a sample-average step (1 / N)."""
        self.N[action] += 1
        self.Q[action] += (reward - self.Q[action]) / self.N[action]

    def alpha_learn(self, action, reward, alpha=0.1):
        """Update N and Q for *action* with the constant step size *alpha*."""
        self.N[action] += 1
        self.Q[action] += alpha * (reward - self.Q[action])


class GradientAgent(SimpleAgent):
    """Agent that samples actions from a softmax over learned preferences.

    Attributes (in addition to those of ``SimpleAgent``):
        H: float array of per-action preferences, starting at zero.
        avg_reward: running mean of all rewards seen, used as the baseline.
        timestep: number of rewards seen so far.
    """

    def __init__(self, num_actions, Q_init=0):
        # Initialise N and Q as well so the inherited alpha_learn keeps working.
        super().__init__(num_actions, Q_init)
        # All-zero preferences give a uniform initial policy.
        self.H = np.zeros(num_actions)
        self.avg_reward = 0.0
        self.timestep = 0

    def choose_action(self, epsilon=None):
        """Sample an action from ``softmax(H)``.

        *epsilon* is accepted only so the call signature matches
        ``SimpleAgent.choose_action``; it is ignored.
        """
        return np.random.choice(self.num_actions, p=softmax(self.H))

    def learn(self, action, reward, alpha=0.4):
        """Move the preferences along the reward gradient.

        The baseline (running mean reward) is updated first and therefore
        already includes *reward*. The chosen action's preference changes by
        ``alpha * (reward - baseline) * (1 - p[action])`` and every other
        action's by ``-alpha * (reward - baseline) * p[a]``.
        """
        self.timestep += 1
        self.avg_reward += (reward - self.avg_reward) / self.timestep
        advantage = reward - self.avg_reward
        # Probabilities are taken once, before any preference is modified, so
        # every arm is updated against the same policy.
        probs = softmax(self.H)
        self.H -= alpha * advantage * probs
        self.H[action] += alpha * advantage


# Event loop for i episodes each with t timesteps

def run_experiment(env, make_agent, optimal_choice, episodes=100,
                   timesteps=1000, epsilon=0.1, window=100):
    """Run *episodes* independent runs and average the learning curves.

    Args:
        env: object whose ``step(action)`` returns a 4-tuple with the reward
            in second position.
        make_agent: zero-argument callable returning a fresh agent; it is
            called once per episode so runs do not share what they learned.
        optimal_choice: index of the action counted as optimal.
        episodes: number of runs to average over.
        timesteps: number of actions taken in each run.
        epsilon: exploration factor passed to ``agent.choose_action``.
        window: window length of the reward moving average.

    Returns:
        Tuple ``(avg_rewards, avg_total_rewards, avg_optimal)``: the moving
        average of rewards, the running mean reward and the running fraction
        of optimal picks, each averaged across episodes.
    """
    all_optimal_trackers = []
    all_total_rewards = []
    all_rewards = []

    for _ in range(episodes):
        agent = make_agent()
        rewards = np.empty(timesteps)
        optimal_hits = np.zeros(timesteps)

        for t in range(timesteps):
            action = agent.choose_action(epsilon)
            _, reward, _, _ = env.step(action)
            agent.learn(action, reward)
            rewards[t] = reward
            optimal_hits[t] = action == optimal_choice

        all_optimal_trackers.append(divide_by_index(np.cumsum(optimal_hits)))
        all_total_rewards.append(divide_by_index(np.cumsum(rewards)))
        all_rewards.append(moving_average(rewards, window))

    # Get averages across all episodes
    return (
        avg_across_arrays(all_rewards),
        avg_across_arrays(all_total_rewards),
        avg_across_arrays(all_optimal_trackers),
    )


# Plotting

def _plot_panel(plt, position, data, style, title, xlabel, ylabel):
    """Draw one labelled subplot with a grid at subplot code *position*."""
    plt.subplot(position)
    plt.plot(data, *style)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)


def plot_results(p_dist, r_dist, avg_rewards, avg_total_rewards, avg_optimal):
    """Show the arm distributions and the averaged learning curves."""
    import matplotlib.pyplot as plt

    plt.figure(1, figsize=(15, 10))

    _plot_panel(plt, 321, p_dist, ('o',),
                'probability distribution of bandit arms',
                'action', 'probability')
    _plot_panel(plt, 322, r_dist, ('ro',),
                'mean of reward distribution by action',
                'action', 'reward')
    _plot_panel(plt, 323, avg_rewards, (),
                'moving average of rewards', 'timestep', 'reward')
    _plot_panel(plt, 324, avg_total_rewards, (),
                'average of total rewards', 'timestep', 'reward')
    # avg_optimal is a fraction in [0, 1]; scale it to match the "%" labels.
    _plot_panel(plt, 325, 100 * np.asarray(avg_optimal), (),
                '% of optimal choices', 'timestep', '% optimal choice')

    plt.subplots_adjust(top=0.99, bottom=.0, left=0.10, right=0.95,
                        hspace=0.5, wspace=0.5)
    plt.show()


def main(env_name="BanditTenArmedGaussian-v0", episodes=100, timesteps=1000,
         epsilon=0.1):
    """Create the bandit environment, train a GradientAgent and plot results."""
    # gym_bandits is never referenced by name; it is imported only for its
    # side effect (presumably registering the Bandit* environments with gym).
    import gym_bandits  # noqa: F401
    import gym

    env = gym.make(env_name)
    env.reset()

    # Get reward distribution: the first element of each entry is used as the
    # arm's mean reward.
    r_dist = [dist[0] for dist in env.env.r_dist]
    optimal_choice = np.argmax(r_dist)
    print("optimal choice is action at index:", optimal_choice)
    print("reward distribution:", r_dist)

    results = run_experiment(
        env,
        lambda: GradientAgent(env.action_space.n),
        optimal_choice,
        episodes=episodes,
        timesteps=timesteps,
        epsilon=epsilon,
    )
    plot_results(env.env.p_dist, r_dist, *results)


if __name__ == "__main__":
    main()