
Commit

cleaning up some errors
big-c-note committed May 20, 2020
1 parent cd48a73, commit 2bbb7e9
Showing 2 changed files with 21 additions and 179 deletions.
research/test_methodology/RT.py: 20 changes (9 additions, 11 deletions)
@@ -1,27 +1,25 @@
+from typing import List
 
 import dill as pickle
 
-from RT_cfr import *
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from RT_cfr import train
+from pluribus.games.short_deck.agent import TrainedAgent
+from pluribus.poker.card import Card
 
 
 if __name__ == "__main__":
-    # public_cards = [Card("ace", "spades"), Card("queen", "spades"), Card("queen", "hearts")]
-    public_cards = []
+    # public_cards = [Card("ace", "spades"), Card("queen", "spades"),
+    # Card("queen", "hearts")]
+    public_cards: List[Card] = []
     # we load a (trained) strategy
     agent1 = TrainedAgent("../blueprint_algo/results_2020_05_10_21_36_47_291425")
     # sorta hacky, but I loaded the average strategy above, now I'm replacing with
     # the better strategy
-    # offline_strategy = joblib.load('/Users/colin/Downloads/offline_strategy_285800.gz')
-    # print(sys.getsizeof(offline_strategy))
-    # agent1.offline_strategy = offline_strategy
-    # print(sys.getsizeof(agent1.offline_strategy))
     action_sequence = ["raise", "call", "call", "call", "call"]
     agent_output = train(
         agent1.offline_strategy, public_cards, action_sequence, 40, 6, 6, 3, 2, 6
-    ) # TODO: back to 50
-    with open("realtime-strategy-refactor-deck.pkl", "wb") as file:
+    )
+    with open("testing2.pkl", "wb") as file:
         pickle.dump(agent_output, file)
     import ipdb
     ipdb.set_trace()
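RT.py ends by dumping whatever `train` returns to disk with dill. As a minimal sketch, the artifact can be read back the same way; this assumes only the `pickle.dump(agent_output, file)` call shown above, and the structure of `agent_output` itself is not specified anywhere in this commit.

```python
# Minimal sketch: load the object that RT.py pickled with dill.
# The contents of `agent_output` depend on what train() returns,
# which this diff does not define.
import dill as pickle

with open("testing2.pkl", "rb") as file:
    agent_output = pickle.load(file)

# Inspect what came back before relying on its structure.
print(type(agent_output))
```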
research/test_methodology/RT_cfr.py: 180 changes (12 additions, 168 deletions)
@@ -1,199 +1,41 @@
"""
"""
from __future__ import annotations

import logging

logging.basicConfig(filename="test.txt", level=logging.DEBUG)
import sys

from tqdm import trange
import numpy as np

from pluribus import utils
from pluribus.games.short_deck.state import *
from pluribus.games.short_deck.agent import *
from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
from pluribus.games.short_deck.agent import Agent
sys.path.append('../blueprint_algo')
from blueprint_short_deck_poker import calculate_strategy, cfr, cfrp


def update_strategy(agent: Agent, state: ShortDeckPokerState, ph_test_node: int):
"""
:param state: the game state
:param i: the player, i = 1 is always first to act and i = 2 is always second to act, but they take turns who
updates the strategy (only one strategy)
:return: nothing, updates action count in the strategy of actions chosen according to sigma, this simple choosing of
actions is what allows the algorithm to build up preference for one action over another in a given spot
Update strategy for test node only
"""
logging.debug("UPDATE STRATEGY")
logging.debug("########")

logging.debug(f"P(h): {state.player_i}")
logging.debug(f"Betting Round {state._betting_stage}")
logging.debug(f"Community Cards {state._table.community_cards}")
logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
logging.debug(f"Betting Action Correct?: {state.players}")

ph = state.player_i # this is always the case no matter what i is

ph = state.player_i
if ph == ph_test_node:
try:
I = state.info_set
except:
import ipdb

ipdb.set_trace()
I = state.info_set
# calculate regret
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
logging.debug(f"Current regret: {agent.regret[I]}")
sigma = calculate_strategy(agent.regret, I, state)
logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
# choose an action based of sigma
try:
a = np.random.choice(list(sigma[I].keys()), 1, p=list(sigma[I].values()))[0]
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
except ValueError:
p = 1 / len(state.legal_actions)
probabilities = np.full(len(state.legal_actions), p)
a = np.random.choice(state.legal_actions, p=probabilities)
sigma[I] = {action: p for action in state.legal_actions}
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
# Increment the action counter.
agent.strategy[I][a] += 1
logging.debug(f"Updated Strategy for {I}: {agent.strategy[I]}")
return
else:
return


def calculate_strategy(
    regret: Dict[str, Dict[str, float]], I: str, state: ShortDeckPokerState,
):
    """
    :param regret: dictionary of regrets, I is key, then each action at I, with values being regret
    :param sigma: dictionary of strategy updated by regret, iteration is key, then I is key, then each action with prob
    :param I:
    :param state: the game state
    :return: doesn't return anything, just updates sigma
    """
    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
    rsum = sum([max(x, 0) for x in regret[I].values()])
    for a in state.legal_actions:
        if rsum > 0:
            sigma[I][a] = max(regret[I][a], 0) / rsum
        else:
            sigma[I][a] = 1 / len(state.legal_actions)
    return sigma


def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
"""
regular cfr algo
:param state: the game state
:param i: player
:param t: iteration
:return: expected value for node for player i
"""
logging.debug("CFR")
logging.debug("########")
logging.debug(f"Iteration: {t}")
logging.debug(f"Player Set to Update Regret: {i}")
logging.debug(f"P(h): {state.player_i}")
logging.debug(f"P(h) Updating Regret? {state.player_i == i}")
logging.debug(f"Betting Round {state._betting_stage}")
logging.debug(f"Community Cards {state._table.community_cards}")
logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
logging.debug(f"Betting Action Correct?: {state.players}")

ph = state.player_i

player_not_in_hand = not state.players[i].is_active
if state.is_terminal or player_not_in_hand:
return state.payout[i]

# NOTE(fedden): The logic in Algorithm 1 in the supplementary material
# instructs the following lines of logic, but state class
# will already skip to the next in-hand player.
# elif p_i not in hand:
# cfr()
# NOTE(fedden): According to Algorithm 1 in the supplementary material,
# we would add in the following bit of logic. However we
# already have the game logic embedded in the state class,
# and this accounts for the chance samplings. In other words,
# it makes sure that chance actions such as dealing cards
# happen at the appropriate times.
# elif h is chance_node:
# sample action from strategy for h
# cfr()

elif ph == i:
try:
I = state.info_set
except:
import ipdb

ipdb.set_trace()
# calculate strategy
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
logging.debug(f"Current regret: {agent.regret[I]}")
sigma = calculate_strategy(agent.regret, I, state)
logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")

vo = 0.0
voa = {}
for a in state.legal_actions:
logging.debug(
f"ACTION TRAVERSED FOR REGRET: ph {state.player_i} ACTION: {a}"
)
new_state: ShortDeckPokerState = state.apply_action(a)
voa[a] = cfr(agent, new_state, i, t)
logging.debug(f"Got EV for {a}: {voa[a]}")
vo += sigma[I][a] * voa[a]
logging.debug(
f"""Added to Node EV for ACTION: {a} INFOSET: {I}
STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
)
logging.debug(f"Updated EV at {I}: {vo}")

for a in state.legal_actions:
agent.regret[I][a] += voa[a] - vo
logging.debug(f"Updated Regret at {I}: {agent.regret[I]}")

return vo
else:
# import ipdb;
# ipdb.set_trace()
try:
Iph = state.info_set
except:
import ipdb

ipdb.set_trace()
logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[Iph]}")
logging.debug(f"Current regret: {agent.regret[Iph]}")
sigma = calculate_strategy(agent.regret, Iph, state)
logging.debug(f"Calculated Strategy for {Iph}: {sigma[Iph]}")

try:
a = np.random.choice(
list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
)[0]
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")

except ValueError:
p = 1 / len(state.legal_actions)
probabilities = np.full(len(state.legal_actions), p)
a = np.random.choice(state.legal_actions, p=probabilities)
sigma[Iph] = {action: p for action in state.legal_actions}
logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")

new_state: ShortDeckPokerState = state.apply_action(a)
return cfr(agent, new_state, i, t)


def train(
    offline_strategy: Dict,
    public_cards: list,
@@ -206,10 +48,12 @@ def train(
    update_threshold: int,
):
    """Train agent."""
    # TODO: fix the seed
    utils.random.seed(36)
    agent = Agent()

    state: ShortDeckPokerState = new_game(3, real_time_test=True, public_cards=public_cards)
    state: ShortDeckPokerState = new_game(3, real_time_test=True,
                                          public_cards=public_cards)
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy,
        action_sequence
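For reference, the `calculate_strategy` helper shown in the RT_cfr.py diff (and imported from `blueprint_short_deck_poker` in the new version) is plain regret matching: positive regrets are normalised into a probability distribution, with a uniform fallback when no action has positive regret. A standalone sketch of that rule, using a made-up regret table rather than the repo's `Agent` class:

```python
from typing import Dict, List


def regret_matching(regret: Dict[str, float], legal_actions: List[str]) -> Dict[str, float]:
    """Normalise positive regrets into a strategy; fall back to uniform."""
    rsum = sum(max(r, 0.0) for r in regret.values())
    if rsum > 0:
        return {a: max(regret.get(a, 0.0), 0.0) / rsum for a in legal_actions}
    return {a: 1.0 / len(legal_actions) for a in legal_actions}


# Hypothetical regret values for a single info set, for illustration only.
regrets = {"fold": -1.0, "call": 2.0, "raise": 6.0}
print(regret_matching(regrets, ["fold", "call", "raise"]))
# fold gets 0.0, call 0.25, raise 0.75
```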

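Both `update_strategy` and the opponent branch of `cfr` in the diff sample a single action from the current strategy with `np.random.choice`, and fall back to a uniform choice over `state.legal_actions` when the stored distribution does not line up with the legal actions (the `ValueError` branch); in `update_strategy`, the sampled action then increments a counter that accumulates the average strategy. A stripped-down sketch of that sampling pattern, with plain dicts standing in for the game state and `agent.strategy[I]`:

```python
from typing import Dict, List

import numpy as np


def sample_action(sigma_i: Dict[str, float], legal_actions: List[str]) -> str:
    """Sample an action from sigma; fall back to uniform if sigma is unusable."""
    try:
        return np.random.choice(list(sigma_i.keys()), p=list(sigma_i.values()))
    except ValueError:
        # Probabilities missing or not summing to 1: use a uniform choice,
        # mirroring the except-branch in the diff above.
        return np.random.choice(legal_actions)


# Toy stand-ins for agent.strategy[I] and sigma[I], for illustration only.
strategy_counts = {"fold": 0, "call": 0, "raise": 0}
a = sample_action({"fold": 0.1, "call": 0.4, "raise": 0.5}, ["fold", "call", "raise"])
strategy_counts[a] += 1  # update_strategy increments the sampled action's count
print(a, strategy_counts)
```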