
Commit

cleaning up some errors
big-c-note committed May 20, 2020
1 parent cd48a73, commit 2bbb7e9
Showing 2 changed files with 21 additions and 179 deletions.
research/test_methodology/RT.py: 20 changes (9 additions, 11 deletions)
@@ -1,27 +1,25 @@
+from typing import List
 
 import dill as pickle
 
-from RT_cfr import *
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from RT_cfr import train
+from pluribus.games.short_deck.agent import TrainedAgent
+from pluribus.poker.card import Card
 
 
 if __name__ == "__main__":
-    # public_cards = [Card("ace", "spades"), Card("queen", "spades"), Card("queen", "hearts")]
-    public_cards = []
+    # public_cards = [Card("ace", "spades"), Card("queen", "spades"),
+    # Card("queen", "hearts")]
+    public_cards: List[Card] = []
     # we load a (trained) strategy
     agent1 = TrainedAgent("../blueprint_algo/results_2020_05_10_21_36_47_291425")
     # sorta hacky, but I loaded the average strategy above, now I'm replacing with
     # the better strategy
-    # offline_strategy = joblib.load('/Users/colin/Downloads/offline_strategy_285800.gz')
-    # print(sys.getsizeof(offline_strategy))
-    # agent1.offline_strategy = offline_strategy
-    # print(sys.getsizeof(agent1.offline_strategy))
     action_sequence = ["raise", "call", "call", "call", "call"]
     agent_output = train(
         agent1.offline_strategy, public_cards, action_sequence, 40, 6, 6, 3, 2, 6
-    ) # TODO: back to 50
-    with open("realtime-strategy-refactor-deck.pkl", "wb") as file:
+    )
+    with open("testing2.pkl", "wb") as file:
         pickle.dump(agent_output, file)
     import ipdb
     ipdb.set_trace()
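RT.py ends by dumping whatever `train` returns to disk with dill. As a minimal sketch, the artifact can be read back the same way; this assumes only the `pickle.dump(agent_output, file)` call shown above, and the structure of `agent_output` itself is not specified anywhere in this commit.

```python
# Minimal sketch: load the object that RT.py pickled with dill.
# The contents of `agent_output` depend on what train() returns,
# which this diff does not define.
import dill as pickle

with open("testing2.pkl", "rb") as file:
    agent_output = pickle.load(file)

# Inspect what came back before relying on its structure.
print(type(agent_output))
```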
research/test_methodology/RT_cfr.py: 180 changes (12 additions, 168 deletions)
@@ -1,199 +1,41 @@
"""
"""
from __future__ import annotations

import logging

logging.basicConfig(filename="test.txt", level=logging.DEBUG)
import sys

from tqdm import trange
import numpy as np

from pluribus import utils
from pluribus.games.short_deck.state import *
from pluribus.games.short_deck.agent import *
from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
from pluribus.games.short_deck.agent import Agent
sys.path.append('../blueprint_algo')
from blueprint_short_deck_poker import calculate_strategy, cfr, cfrp


def update_strategy(agent: Agent, state: ShortDeckPokerState, ph_test_node: int):
"""
:param state: the game state
:param i: the player, i = 1 is always first to act and i = 2 is always second to act, but they take turns who
updates the strategy (only one strategy)
:return: nothing, updates action count in the strategy of actions chosen according to sigma, this simple choosing of
actions is what allows the algorithm to build up preference for one action over another in a given spot
Update strategy for test node only
"""
logging.debug("UPDATE STRATEGY")
logging.debug("########")

logging.debug(f"P(h): {state.player_i}")
logging.debug(f"Betting Round {state._betting_stage}")
logging.debug(f"Community Cards {state._table.community_cards}")
logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
logging.debug(f"Betting Action Correct?: {state.players}")

ph = state.player_i # this is always the case no matter what i is

ph = state.player_i
if ph == ph_test_node:
try:
I = state.info_set
except:
import ipdb

ipdb.set_trace()
I = state.info_set
# calculate regret
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
logging.debug(f"Current regret: {agent.regret[I]}")
sigma = calculate_strategy(agent.regret, I, state)
logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
# choose an action based of sigma
try:
a = np.random.choice(list(sigma[I].keys()), 1, p=list(sigma[I].values()))[0]
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
except ValueError:
p = 1 / len(state.legal_actions)
probabilities = np.full(len(state.legal_actions), p)
a = np.random.choice(state.legal_actions, p=probabilities)
sigma[I] = {action: p for action in state.legal_actions}
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
# Increment the action counter.
agent.strategy[I][a] += 1
logging.debug(f"Updated Strategy for {I}: {agent.strategy[I]}")
return
else:
return


def calculate_strategy(
    regret: Dict[str, Dict[str, float]], I: str, state: ShortDeckPokerState,
):
    """
    :param regret: dictionary of regrets, I is key, then each action at I, with values being regret
    :param sigma: dictionary of strategy updated by regret, iteration is key, then I is key, then each action with prob
    :param I:
    :param state: the game state
    :return: doesn't return anything, just updates sigma
    """
    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
    rsum = sum([max(x, 0) for x in regret[I].values()])
    for a in state.legal_actions:
        if rsum > 0:
            sigma[I][a] = max(regret[I][a], 0) / rsum
        else:
            sigma[I][a] = 1 / len(state.legal_actions)
    return sigma


def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
"""
regular cfr algo
:param state: the game state
:param i: player
:param t: iteration
:return: expected value for node for player i
"""
logging.debug("CFR")
logging.debug("########")
logging.debug(f"Iteration: {t}")
logging.debug(f"Player Set to Update Regret: {i}")
logging.debug(f"P(h): {state.player_i}")
logging.debug(f"P(h) Updating Regret? {state.player_i == i}")
logging.debug(f"Betting Round {state._betting_stage}")
logging.debug(f"Community Cards {state._table.community_cards}")
logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
logging.debug(f"Betting Action Correct?: {state.players}")

ph = state.player_i

player_not_in_hand = not state.players[i].is_active
if state.is_terminal or player_not_in_hand:
return state.payout[i]

# NOTE(fedden): The logic in Algorithm 1 in the supplementary material
# instructs the following lines of logic, but state class
# will already skip to the next in-hand player.
# elif p_i not in hand:
# cfr()
# NOTE(fedden): According to Algorithm 1 in the supplementary material,
# we would add in the following bit of logic. However we
# already have the game logic embedded in the state class,
# and this accounts for the chance samplings. In other words,
# it makes sure that chance actions such as dealing cards
# happen at the appropriate times.
# elif h is chance_node:
# sample action from strategy for h
# cfr()

elif ph == i:
try:
I = state.info_set
except:
import ipdb

ipdb.set_trace()
# calculate strategy
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
logging.debug(f"Current regret: {agent.regret[I]}")
sigma = calculate_strategy(agent.regret, I, state)
logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")

vo = 0.0
voa = {}
for a in state.legal_actions:
logging.debug(
f"ACTION TRAVERSED FOR REGRET: ph {state.player_i} ACTION: {a}"
)
new_state: ShortDeckPokerState = state.apply_action(a)
voa[a] = cfr(agent, new_state, i, t)
logging.debug(f"Got EV for {a}: {voa[a]}")
vo += sigma[I][a] * voa[a]
logging.debug(
f"""Added to Node EV for ACTION: {a} INFOSET: {I}
STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
)
logging.debug(f"Updated EV at {I}: {vo}")

for a in state.legal_actions:
agent.regret[I][a] += voa[a] - vo
logging.debug(f"Updated Regret at {I}: {agent.regret[I]}")

return vo
else:
# import ipdb;
# ipdb.set_trace()
try:
Iph = state.info_set
except:
import ipdb

ipdb.set_trace()
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[Iph]}")
logging.debug(f"Current regret: {agent.regret[Iph]}")
sigma = calculate_strategy(agent.regret, Iph, state)
logging.debug(f"Calculated Strategy for {Iph}: {sigma[Iph]}")

try:
a = np.random.choice(
list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
)[0]
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")

except ValueError:
p = 1 / len(state.legal_actions)
probabilities = np.full(len(state.legal_actions), p)
a = np.random.choice(state.legal_actions, p=probabilities)
sigma[Iph] = {action: p for action in state.legal_actions}
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")

new_state: ShortDeckPokerState = state.apply_action(a)
return cfr(agent, new_state, i, t)


def train(
    offline_strategy: Dict,
    public_cards: list,
@@ -206,10 +48,12 @@ def train(
    update_threshold: int,
):
    """Train agent."""
    # TODO: fix the seed
    utils.random.seed(36)
    agent = Agent()

    state: ShortDeckPokerState = new_game(3, real_time_test=True, public_cards=public_cards)
    state: ShortDeckPokerState = new_game(3, real_time_test=True,
                                          public_cards=public_cards)
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy,
        action_sequence
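For reference, the `calculate_strategy` helper shown in the RT_cfr.py diff (and imported from `blueprint_short_deck_poker` in the new version) is plain regret matching: positive regrets are normalised into a probability distribution, with a uniform fallback when no action has positive regret. A standalone sketch of that rule, using a made-up regret table rather than the repo's `Agent` class:

```python
from typing import Dict, List


def regret_matching(regret: Dict[str, float], legal_actions: List[str]) -> Dict[str, float]:
    """Normalise positive regrets into a strategy; fall back to uniform."""
    rsum = sum(max(r, 0.0) for r in regret.values())
    if rsum > 0:
        return {a: max(regret.get(a, 0.0), 0.0) / rsum for a in legal_actions}
    return {a: 1.0 / len(legal_actions) for a in legal_actions}


# Hypothetical regret values for a single info set, for illustration only.
regrets = {"fold": -1.0, "call": 2.0, "raise": 6.0}
print(regret_matching(regrets, ["fold", "call", "raise"]))
# fold gets 0.0, call 0.25, raise 0.75
```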

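Both `update_strategy` and the opponent branch of `cfr` in the diff sample a single action from the current strategy with `np.random.choice`, and fall back to a uniform choice over `state.legal_actions` when the stored distribution does not line up with the legal actions (the `ValueError` branch); in `update_strategy`, the sampled action then increments a counter that accumulates the average strategy. A stripped-down sketch of that sampling pattern, with plain dicts standing in for the game state and `agent.strategy[I]`:

```python
from typing import Dict, List

import numpy as np


def sample_action(sigma_i: Dict[str, float], legal_actions: List[str]) -> str:
    """Sample an action from sigma; fall back to uniform if sigma is unusable."""
    try:
        return np.random.choice(list(sigma_i.keys()), p=list(sigma_i.values()))
    except ValueError:
        # Probabilities missing or not summing to 1: use a uniform choice,
        # mirroring the except-branch in the diff above.
        return np.random.choice(legal_actions)


# Toy stand-ins for agent.strategy[I] and sigma[I], for illustration only.
strategy_counts = {"fold": 0, "call": 0, "raise": 0}
a = sample_action({"fold": 0.1, "call": 0.4, "raise": 0.5}, ["fold", "call", "raise"])
strategy_counts[a] += 1  # update_strategy increments the sampled action's count
print(a, strategy_counts)
```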