forked from TikhonJelvis/RL-book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_monte_carlo.py
49 lines (38 loc) · 1.45 KB
/
test_monte_carlo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import unittest
import random
from rl.distribution import Categorical, Choose
from rl.function_approx import Tabular
import rl.iterate as iterate
from rl.markov_process import FiniteMarkovRewardProcess, NonTerminal
import rl.monte_carlo as mc
class FlipFlop(FiniteMarkovRewardProcess[bool]):
    '''A two-state (True/False) flip-flop MRP built on top of the
    FiniteMarkovProcess machinery.
    '''
    def __init__(self, p: float):
        # From either state: flip to the other state (reward 2.0) with
        # probability p, or stay put (reward 1.0) with probability 1 - p.
        dynamics = {}
        for state in (True, False):
            dynamics[state] = Categorical({
                (not state, 2.0): p,
                (state, 1.0): 1 - p
            })
        super().__init__(dynamics)
class TestEvaluate(unittest.TestCase):
    '''Checks tabular Monte Carlo prediction on the FlipFlop MRP.'''

    def setUp(self):
        # Fixed seed so the sampled reward traces are reproducible.
        random.seed(42)
        self.finite_flip_flop = FlipFlop(0.7)

    def test_evaluate_finite_mrp(self):
        # Zero-initialized tabular value function over all states.
        zero_approx = Tabular(
            {s: 0.0 for s in self.finite_flip_flop.non_terminal_states}
        )
        start_dist = Choose({
            NonTerminal(True),
            NonTerminal(False)
        })
        episodes = self.finite_flip_flop.reward_traces(start_dist)
        value_fn = iterate.converged(
            mc.mc_prediction(episodes, γ=0.99, approx_0=zero_approx),
            # Loose bound of 0.01 to speed up test.
            done=lambda a, b: a.within(b, 0.01)
        )
        self.assertEqual(len(value_fn.values_map), 2)
        for state in value_fn.values_map:
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(value_fn(state) - 170), 1.0)