diff --git a/research/test_methodology/RT.py b/research/test_methodology/RT.py
index b89b5815..388f321e 100644
--- a/research/test_methodology/RT.py
+++ b/research/test_methodology/RT.py
@@ -1,27 +1,25 @@
+from typing import List
+
 import dill as pickle
-from RT_cfr import *
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from RT_cfr import train
+from pluribus.games.short_deck.agent import TrainedAgent
 from pluribus.poker.card import Card
 
 if __name__ == "__main__":
-    # public_cards = [Card("ace", "spades"), Card("queen", "spades"), Card("queen", "hearts")]
-    public_cards = []
+    # public_cards = [Card("ace", "spades"), Card("queen", "spades"),
+    #                 Card("queen", "hearts")]
+    public_cards: List[Card] = []
     # we load a (trained) strategy
     agent1 = TrainedAgent("../blueprint_algo/results_2020_05_10_21_36_47_291425")
-
     # sorta hacky, but I loaded the average strategy above, now I'm replacing with
     # the better strategy
     # offline_strategy = joblib.load('/Users/colin/Downloads/offline_strategy_285800.gz')
-    # print(sys.getsizeof(offline_strategy))
-    # agent1.offline_strategy = offline_strategy
-    # print(sys.getsizeof(agent1.offline_strategy))
     action_sequence = ["raise", "call", "call", "call", "call"]
     agent_output = train(
         agent1.offline_strategy, public_cards, action_sequence, 40, 6, 6, 3, 2, 6
-    )  # TODO: back to 50
-    with open("realtime-strategy-refactor-deck.pkl", "wb") as file:
+    )
+    with open("testing2.pkl", "wb") as file:
         pickle.dump(agent_output, file)
 
     import ipdb
     ipdb.set_trace()
diff --git a/research/test_methodology/RT_cfr.py b/research/test_methodology/RT_cfr.py
index 72fad794..07d31d21 100644
--- a/research/test_methodology/RT_cfr.py
+++ b/research/test_methodology/RT_cfr.py
@@ -1,199 +1,41 @@
-"""
-"""
 from __future__ import annotations
 
 import logging
-
-logging.basicConfig(filename="test.txt", level=logging.DEBUG)
+import sys
 
 from tqdm import trange
+import numpy as np
 
 from pluribus import utils
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
+from pluribus.games.short_deck.agent import Agent
+
+sys.path.append('../blueprint_algo')
+from blueprint_short_deck_poker import calculate_strategy, cfr, cfrp
 
 
 def update_strategy(agent: Agent, state: ShortDeckPokerState, ph_test_node: int):
     """
-
-    :param state: the game state
-    :param i: the player, i = 1 is always first to act and i = 2 is always second to act, but they take turns who
-        updates the strategy (only one strategy)
-    :return: nothing, updates action count in the strategy of actions chosen according to sigma, this simple choosing of
-        actions is what allows the algorithm to build up preference for one action over another in a given spot
+    Update strategy for test node only
     """
-    logging.debug("UPDATE STRATEGY")
-    logging.debug("########")
-
-    logging.debug(f"P(h): {state.player_i}")
-    logging.debug(f"Betting Round {state._betting_stage}")
-    logging.debug(f"Community Cards {state._table.community_cards}")
-    logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-    logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-    logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-    logging.debug(f"Betting Action Correct?: {state.players}")
-
-    ph = state.player_i  # this is always the case no matter what i is
-
+    ph = state.player_i
     if ph == ph_test_node:
-        try:
-            I = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
+        I = state.info_set
         # calculate regret
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-        logging.debug(f"Current regret: {agent.regret[I]}")
         sigma = calculate_strategy(agent.regret, I, state)
-        logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
         # choose an action based of sigma
         try:
             a = np.random.choice(list(sigma[I].keys()), 1, p=list(sigma[I].values()))[0]
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
         except ValueError:
             p = 1 / len(state.legal_actions)
             probabilities = np.full(len(state.legal_actions), p)
             a = np.random.choice(state.legal_actions, p=probabilities)
             sigma[I] = {action: p for action in state.legal_actions}
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
         # Increment the action counter.
         agent.strategy[I][a] += 1
-        logging.debug(f"Updated Strategy for {I}: {agent.strategy[I]}")
         return
     else:
         return
-
-def calculate_strategy(
-    regret: Dict[str, Dict[str, float]], I: str, state: ShortDeckPokerState,
-):
-    """
-
-    :param regret: dictionary of regrets, I is key, then each action at I, with values being regret
-    :param sigma: dictionary of strategy updated by regret, iteration is key, then I is key, then each action with prob
-    :param I:
-    :param state: the game state
-    :return: doesn't return anything, just updates sigma
-    """
-    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
-    rsum = sum([max(x, 0) for x in regret[I].values()])
-    for a in state.legal_actions:
-        if rsum > 0:
-            sigma[I][a] = max(regret[I][a], 0) / rsum
-        else:
-            sigma[I][a] = 1 / len(state.legal_actions)
-    return sigma
-
-
-def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
-    """
-    regular cfr algo
-
-    :param state: the game state
-    :param i: player
-    :param t: iteration
-    :return: expected value for node for player i
-    """
-    logging.debug("CFR")
-    logging.debug("########")
-    logging.debug(f"Iteration: {t}")
-    logging.debug(f"Player Set to Update Regret: {i}")
-    logging.debug(f"P(h): {state.player_i}")
-    logging.debug(f"P(h) Updating Regret? {state.player_i == i}")
-    logging.debug(f"Betting Round {state._betting_stage}")
-    logging.debug(f"Community Cards {state._table.community_cards}")
-    logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-    logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-    logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-    logging.debug(f"Betting Action Correct?: {state.players}")
-
-    ph = state.player_i
-
-    player_not_in_hand = not state.players[i].is_active
-    if state.is_terminal or player_not_in_hand:
-        return state.payout[i]
-
-    # NOTE(fedden): The logic in Algorithm 1 in the supplementary material
-    #               instructs the following lines of logic, but state class
-    #               will already skip to the next in-hand player.
-    # elif p_i not in hand:
-    #   cfr()
-    # NOTE(fedden): According to Algorithm 1 in the supplementary material,
-    #               we would add in the following bit of logic. However we
-    #               already have the game logic embedded in the state class,
-    #               and this accounts for the chance samplings. In other words,
-    #               it makes sure that chance actions such as dealing cards
-    #               happen at the appropriate times.
-    # elif h is chance_node:
-    #   sample action from strategy for h
-    #   cfr()
-
-    elif ph == i:
-        try:
-            I = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
-        # calculate strategy
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-        logging.debug(f"Current regret: {agent.regret[I]}")
-        sigma = calculate_strategy(agent.regret, I, state)
-        logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
-
-        vo = 0.0
-        voa = {}
-        for a in state.legal_actions:
-            logging.debug(
-                f"ACTION TRAVERSED FOR REGRET: ph {state.player_i} ACTION: {a}"
-            )
-            new_state: ShortDeckPokerState = state.apply_action(a)
-            voa[a] = cfr(agent, new_state, i, t)
-            logging.debug(f"Got EV for {a}: {voa[a]}")
-            vo += sigma[I][a] * voa[a]
-            logging.debug(
-                f"""Added to Node EV for ACTION: {a} INFOSET: {I}
-                STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
-            )
-        logging.debug(f"Updated EV at {I}: {vo}")
-
-        for a in state.legal_actions:
-            agent.regret[I][a] += voa[a] - vo
-        logging.debug(f"Updated Regret at {I}: {agent.regret[I]}")
-
-        return vo
-    else:
-        # import ipdb;
-        # ipdb.set_trace()
-        try:
-            Iph = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[Iph]}")
-        logging.debug(f"Current regret: {agent.regret[Iph]}")
-        sigma = calculate_strategy(agent.regret, Iph, state)
-        logging.debug(f"Calculated Strategy for {Iph}: {sigma[Iph]}")
-
-        try:
-            a = np.random.choice(
-                list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
-            )[0]
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-        except ValueError:
-            p = 1 / len(state.legal_actions)
-            probabilities = np.full(len(state.legal_actions), p)
-            a = np.random.choice(state.legal_actions, p=probabilities)
-            sigma[Iph] = {action: p for action in state.legal_actions}
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-        new_state: ShortDeckPokerState = state.apply_action(a)
-        return cfr(agent, new_state, i, t)
-
-
 def train(
     offline_strategy: Dict,
     public_cards: list,
@@ -206,10 +48,12 @@
     update_threshold: int,
 ):
     """Train agent."""
+    # TODO: fix the seed
    utils.random.seed(36)
    agent = Agent()
-    state: ShortDeckPokerState = new_game(3, real_time_test=True, public_cards=public_cards)
+    state: ShortDeckPokerState = new_game(3, real_time_test=True,
+                                          public_cards=public_cards)
 
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy, action_sequence
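Usage note: RT.py above dumps whatever RT_cfr.train returns to "testing2.pkl" via dill and then drops into ipdb. A minimal sketch for loading that output back in a separate session, assuming only the file name used above (the structure of agent_output depends on what train returns, which is not shown in full in this diff):

import dill as pickle  # RT.py serialises with dill, so load with dill as well

# "testing2.pkl" is the path RT.py writes above; adjust if the script is changed.
with open("testing2.pkl", "rb") as file:
    agent_output = pickle.load(file)

# The exact contents depend on RT_cfr.train's return value, so inspect the type
# and top-level keys/attributes before digging further.
print(type(agent_output))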