diff --git a/pettingzoo/classic/rlcard_envs/texas_holdem.py b/pettingzoo/classic/rlcard_envs/texas_holdem.py
index 0e5699b11..c4fbeb1bd 100644
--- a/pettingzoo/classic/rlcard_envs/texas_holdem.py
+++ b/pettingzoo/classic/rlcard_envs/texas_holdem.py
@@ -9,7 +9,7 @@

 This environment is part of the classic environments. Please read that page first for general information.

-| Import             | `from pettingzoo.classic import texas_holdem_v4` |
+| Import             | `from pettingzoo.classic import texas_holdem_v5` |
 |--------------------|--------------------------------------------------|
 | Actions            | Discrete                                          |
 | Parallel API       | Yes                                               |
@@ -25,10 +25,13 @@
 ## Arguments

 ``` python
-texas_holdem_v4.env(num_players=2)
+texas_holdem_v5.env(num_players=2)
 ```

 `num_players`: Sets the number of players in the game. Minimum is 2.

+`num_rounds`: Sets the number of rounds the environment runs before terminating. If this is set, `num_chips` must also be set.
+`num_chips`: Sets the number of starting chips each agent begins with; must be used in conjunction with `num_rounds`.
+
 ### Observation Space
@@ -69,6 +72,7 @@

 ### Version History

+* v5: Added multi-round option (1.24.2)
 * v4: Upgrade to RLCard 1.0.3 (1.11.0)
 * v3: Fixed bug in arbitrary calls to observe() (1.8.0)
 * v2: Bumped RLCard version, bug fixes, legal action mask in observation replaced illegal move list in infos (1.5.0)
@@ -108,17 +112,30 @@ def get_font(path, size):


 def env(**kwargs):
+    # optionally chain several games, each agent starting with a fixed number of chips
+    num_rounds = kwargs.pop("num_rounds", False)
+    num_chips = kwargs.pop("num_chips", False)
+    assert bool(num_rounds) == bool(
+        num_chips
+    ), f"`num_rounds` and `num_chips` must be set together, got {num_rounds=}, {num_chips=}"
+
     env = raw_env(**kwargs)
     env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
     env = wrappers.AssertOutOfBoundsWrapper(env)
     env = wrappers.OrderEnforcingWrapper(env)
+
+    if num_rounds:
+        env = wrappers.MultiEpisodeEnv(
+            env, num_episodes=num_rounds, starting_utility=num_chips
+        )
+
     return env


 class raw_env(RLCardBase, EzPickle):
     metadata = {
         "render_modes": ["human", "rgb_array"],
-        "name": "texas_holdem_v4",
+        "name": "texas_holdem_v5",
         "is_parallelizable": False,
         "render_fps": 1,
     }
diff --git a/pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py b/pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py
index 59101de69..ba17dd640 100644
--- a/pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py
+++ b/pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py
@@ -9,7 +9,7 @@

 This environment is part of the classic environments. Please read that page first for general information.

-| Import             | `from pettingzoo.classic import texas_holdem_no_limit_v6` |
+| Import             | `from pettingzoo.classic import texas_holdem_no_limit_v7` |
 |--------------------|-----------------------------------------------------------|
 | Actions            | Discrete                                                   |
 | Parallel API       | Yes                                                        |
@@ -29,7 +29,7 @@
 ### Arguments

 ``` python
-texas_holdem_no_limit_v6.env(num_players=2)
+texas_holdem_no_limit_v7.env(num_players=2)
 ```

 `num_players`: Sets the number of players in the game. Minimum is 2.
@@ -79,6 +79,7 @@

 ### Version History

+* v7: Added multi-round option (1.24.2)
 * v6: Upgrade to RLCard 1.0.5, fixes to the action space as ACPC (1.12.0)
 * v5: Upgrade to RLCard 1.0.4, fixes to rewards with greater than 2 players (1.11.1)
 * v4: Upgrade to RLCard 1.0.3 (1.11.0)
@@ -121,17 +122,30 @@ def get_font(path, size):


 def env(**kwargs):
+    # optionally chain several games, each agent starting with a fixed number of chips
+    num_rounds = kwargs.pop("num_rounds", False)
+    num_chips = kwargs.pop("num_chips", False)
+    assert bool(num_rounds) == bool(
+        num_chips
+    ), f"`num_rounds` and `num_chips` must be set together, got {num_rounds=}, {num_chips=}"
+
     env = raw_env(**kwargs)
     env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
     env = wrappers.AssertOutOfBoundsWrapper(env)
     env = wrappers.OrderEnforcingWrapper(env)
+
+    if num_rounds:
+        env = wrappers.MultiEpisodeEnv(
+            env, num_episodes=num_rounds, starting_utility=num_chips
+        )
+
     return env


 class raw_env(RLCardBase, EzPickle):
     metadata = {
         "render_modes": ["human", "rgb_array"],
-        "name": "texas_holdem_no_limit_v6",
+        "name": "texas_holdem_no_limit_v7",
         "is_parallelizable": False,
         "render_fps": 1,
     }
diff --git a/pettingzoo/classic/texas_holdem_no_limit_v6.py b/pettingzoo/classic/texas_holdem_no_limit_v7.py
similarity index 100%
rename from pettingzoo/classic/texas_holdem_no_limit_v6.py
rename to pettingzoo/classic/texas_holdem_no_limit_v7.py
diff --git a/pettingzoo/classic/texas_holdem_v4.py b/pettingzoo/classic/texas_holdem_v5.py
similarity index 100%
rename from pettingzoo/classic/texas_holdem_v4.py
rename to pettingzoo/classic/texas_holdem_v5.py
diff --git a/pettingzoo/test/api_test.py b/pettingzoo/test/api_test.py
index f8718579c..f0ea19636 100644
--- a/pettingzoo/test/api_test.py
+++ b/pettingzoo/test/api_test.py
@@ -68,8 +68,8 @@ def action_mask():
     list and observation_spaces, action_spaces dictionaries."""
     env_obs_dicts = [
         "leduc_holdem_v4",
-        "texas_holdem_no_limit_v6",
-        "texas_holdem_v4",
+        "texas_holdem_no_limit_v7",
+        "texas_holdem_v5",
         "go_v5",
         "chess_v6",
         "connect_four_v3",
@@ -89,8 +89,8 @@ def action_mask():
     env_all_zeros_obs = ["knights_archers_zombies_v10"]
     env_obs_space = [
         "leduc_holdem_v4",
-        "texas_holdem_no_limit_v6",
-        "texas_holdem_v4",
+        "texas_holdem_no_limit_v7",
+        "texas_holdem_v5",
         "go_v5",
         "hanabi_v5",
         "knights_archers_zombies_v10",
diff --git a/pettingzoo/utils/all_modules.py b/pettingzoo/utils/all_modules.py
index e979ed4e6..530d8b2bf 100644
--- a/pettingzoo/utils/all_modules.py
+++ b/pettingzoo/utils/all_modules.py
@@ -37,8 +37,8 @@
     hanabi_v5,
     leduc_holdem_v4,
     rps_v2,
-    texas_holdem_no_limit_v6,
-    texas_holdem_v4,
+    texas_holdem_no_limit_v7,
+    texas_holdem_v5,
     tictactoe_v3,
 )
 from pettingzoo.mpe import (
@@ -93,8 +93,8 @@
     "classic/connect_four_v3": connect_four_v3,
     "classic/tictactoe_v3": tictactoe_v3,
     "classic/leduc_holdem_v4": leduc_holdem_v4,
-    "classic/texas_holdem_v4": texas_holdem_v4,
-    "classic/texas_holdem_no_limit_v6": texas_holdem_no_limit_v6,
+    "classic/texas_holdem_v5": texas_holdem_v5,
+    "classic/texas_holdem_no_limit_v7": texas_holdem_no_limit_v7,
     "classic/gin_rummy_v4": gin_rummy_v4,
     "classic/go_v5": go_v5,
     "classic/hanabi_v5": hanabi_v5,
diff --git a/pettingzoo/utils/wrappers/multi_episode_env.py b/pettingzoo/utils/wrappers/multi_episode_env.py
index d924a0c45..c7fcd4bbd 100644
--- a/pettingzoo/utils/wrappers/multi_episode_env.py
+++ b/pettingzoo/utils/wrappers/multi_episode_env.py
@@ -13,14 +13,22 @@ class MultiEpisodeEnv(BaseWrapper):
     When there are no more valid agents in the underlying environment, the environment is automatically reset.
     After `num_episodes` have been run internally, the environment terminates normally.
     The result of this wrapper is that the environment is no longer Markovian around the environment reset.
+
+    When `starting_utility` is used, every agent starts with a base amount of utility (think of this as poker chips).
+    Whenever an agent receives a negative reward, its magnitude is subtracted from that agent's utility.
+    Whenever an agent receives a positive reward, it is added to that agent's utility.
+    Agents whose utility drops to zero or below are terminated.
     """

-    def __init__(self, env: AECEnv, num_episodes: int):
+    def __init__(
+        self, env: AECEnv, num_episodes: int, starting_utility: float | None = None
+    ):
         """__init__.

         Args:
             env (AECEnv): env
             num_episodes (int): num_episodes
+            starting_utility (Optional[float]): the amount of utility (chips) each agent starts with
         """
         assert isinstance(
             env, AECEnv
@@ -28,6 +36,7 @@ def __init__(self, env: AECEnv, num_episodes: int):
         super().__init__(env)

         self._num_episodes = num_episodes
+        self._starting_utility = starting_utility

     def reset(self, seed: int | None = None, options: dict | None = None) -> None:
         """reset.
@@ -39,10 +48,12 @@ def reset(self, seed: int | None = None, options: dict | None = None) -> None:
         Returns:
             None:
         """
+        super().reset(seed=seed, options=options)
         self._episodes_elapsed = 1
         self._seed = copy.deepcopy(seed)
         self._options = copy.deepcopy(options)
-        super().reset(seed=seed, options=options)
+        if self._starting_utility:
+            self._agent_utilities = {a: self._starting_utility for a in self.agents}

     def step(self, action: ActionType) -> None:
         """Steps the underlying environment for `num_episodes`.
@@ -59,6 +70,18 @@ def step(self, action: ActionType) -> None:
         Returns:
             None:
         """
         super().step(action)
+
+        # adjust agent utilities when `starting_utility` is enabled
+        if self._starting_utility:
+            for agent in self.agents:
+                self._agent_utilities[agent] = (
+                    self._agent_utilities[agent] + self.rewards[agent]
+                )
+
+                if self._agent_utilities[agent] <= 0:
+                    self.terminations[agent] = True
+
+        # if we still have agents, don't need to do anything
         if self.agents:
             return
diff --git a/pettingzoo/utils/wrappers/multi_episode_parallel_env.py b/pettingzoo/utils/wrappers/multi_episode_parallel_env.py
index bbc9c9b60..9a0c83888 100644
--- a/pettingzoo/utils/wrappers/multi_episode_parallel_env.py
+++ b/pettingzoo/utils/wrappers/multi_episode_parallel_env.py
@@ -13,14 +13,22 @@ class MultiEpisodeParallelEnv(BaseParallelWrapper):
     When there are no more valid agents in the underlying environment, the environment is automatically reset.
     When this happens, the `observation` and `info` returned by `step()` are replaced with that of the reset environment.
     The result of this wrapper is that the environment is no longer Markovian around the environment reset.
+
+    When `starting_utility` is used, every agent starts with a base amount of utility (think of this as poker chips).
+    Whenever an agent receives a negative reward, its magnitude is subtracted from that agent's utility.
+    Whenever an agent receives a positive reward, it is added to that agent's utility.
+    Agents whose utility drops to zero or below are terminated.
     """

-    def __init__(self, env: ParallelEnv, num_episodes: int):
+    def __init__(
+        self, env: ParallelEnv, num_episodes: int, starting_utility: float | None = None
+    ):
         """__init__.

         Args:
             env (AECEnv): the base environment
             num_episodes (int): the number of episodes to run the underlying environment
+            starting_utility (Optional[float]): the amount of utility (chips) each agent starts with
         """
         super().__init__(env)
         assert isinstance(
@@ -28,6 +36,7 @@ def __init__(self, env: ParallelEnv, num_episodes: int):
         ), "MultiEpisodeEnv is only compatible with ParallelEnv environments."

         self._num_episodes = num_episodes
+        self._starting_utility = starting_utility

     def reset(
         self, seed: int | None = None, options: dict | None = None
@@ -46,6 +55,8 @@ def reset(
         self._seed = copy.deepcopy(seed)
         self._options = copy.deepcopy(options)
         self._episodes_elapsed = 1
+        if self._starting_utility:
+            self._agent_utilities = {a: self._starting_utility for a in self.agents}

         return obs, info

@@ -77,7 +88,17 @@ def step(
             ]:
         """
         obs, rew, term, trunc, info = super().step(actions)
-        term = {agent: False for agent in term}
+
+        # adjust agent utilities when `starting_utility` is enabled
+        if self._starting_utility:
+            self._agent_utilities = {
+                a: self._agent_utilities[a] + rew[a] for a in self.agents
+            }
+            # termination now depends only on the remaining utility
+            term = {agent: self._agent_utilities[agent] <= 0 for agent in term}
+        else:
+            term = {agent: False for agent in term}
+
         trunc = {agent: False for agent in term}

         if self.agents:
diff --git a/test/all_parameter_combs_test.py b/test/all_parameter_combs_test.py
index cff5a7cc3..63bfc6fe5 100644
--- a/test/all_parameter_combs_test.py
+++ b/test/all_parameter_combs_test.py
@@ -41,8 +41,8 @@
     hanabi_v5,
     leduc_holdem_v4,
     rps_v2,
-    texas_holdem_no_limit_v6,
-    texas_holdem_v4,
+    texas_holdem_no_limit_v7,
+    texas_holdem_v5,
     tictactoe_v3,
 )
 from pettingzoo.mpe import (
@@ -156,11 +156,21 @@
         ),
     ],
     ["classic/leduc_holdem_v4", leduc_holdem_v4, dict()],
-    ["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=3)],
-    ["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=4)],
-    ["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict()],
-    ["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=3)],
-    ["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=4)],
+    ["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=3)],
+    ["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=4)],
+    [
+        "classic/texas_holdem_v5",
+        texas_holdem_v5,
+        dict(num_players=4, num_rounds=3, num_chips=5),
+    ],
+    ["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict()],
+    ["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=3)],
+    ["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=4)],
+    [
+        "classic/texas_holdem_no_limit_v7",
+        texas_holdem_no_limit_v7,
+        dict(num_players=4, num_rounds=3, num_chips=5),
+    ],
     [
         "butterfly/knights_archers_zombies_v10",
         knights_archers_zombies_v10,
diff --git a/test/wrapper_test.py b/test/wrapper_test.py
index 650fe328b..80e413925 100644
--- a/test/wrapper_test.py
+++ b/test/wrapper_test.py
@@ -3,7 +3,7 @@
 import pytest

 from pettingzoo.butterfly import pistonball_v6
-from pettingzoo.classic import texas_holdem_no_limit_v6
+from pettingzoo.classic import texas_holdem_no_limit_v7
 from pettingzoo.utils.wrappers import MultiEpisodeEnv, MultiEpisodeParallelEnv

@@ -16,7 +16,7 @@ def test_multi_episode_env_wrapper(num_episodes: int) -> None:
     Args:
         num_episodes: number of episodes to run the MultiEpisodeEnv
     """
-    env = texas_holdem_no_limit_v6.env(num_players=3)
+    env = texas_holdem_no_limit_v7.env(num_players=3)
     env = MultiEpisodeEnv(env, num_episodes=num_episodes)
     env.reset(seed=42)
diff --git a/tutorials/LangChain/langchain_example.py b/tutorials/LangChain/langchain_example.py
index 5b88bfcc5..05aeb4587 100644
--- a/tutorials/LangChain/langchain_example.py
+++ b/tutorials/LangChain/langchain_example.py
@@ -48,9 +48,9 @@ def tic_tac_toe():


 def texas_holdem_no_limit():
-    from pettingzoo.classic import texas_holdem_no_limit_v6
+    from pettingzoo.classic import texas_holdem_no_limit_v7

-    env = texas_holdem_no_limit_v6.env(num_players=4, render_mode="human")
+    env = texas_holdem_no_limit_v7.env(num_players=4, render_mode="human")
     agents = {
         name: ActionMaskAgent(name=name, model=ChatOpenAI(temperature=0.2), env=env)
         for name in env.possible_agents
diff --git a/tutorials/SB3/test/test_sb3_action_mask.py b/tutorials/SB3/test/test_sb3_action_mask.py
index 3835af393..978b31398 100644
--- a/tutorials/SB3/test/test_sb3_action_mask.py
+++ b/tutorials/SB3/test/test_sb3_action_mask.py
@@ -8,8 +8,8 @@
     go_v5,
     hanabi_v5,
     leduc_holdem_v4,
-    texas_holdem_no_limit_v6,
-    texas_holdem_v4,
+    texas_holdem_no_limit_v7,
+    texas_holdem_v5,
     tictactoe_v3,
 )

@@ -22,8 +22,8 @@
 # These environments do better than random even after the minimum number of timesteps
 EASY_ENVS = [
     gin_rummy_v4,
-    texas_holdem_no_limit_v6,  # texas holdem human rendered game ends instantly, but with random actions it works fine
-    texas_holdem_v4,
+    texas_holdem_no_limit_v7,  # texas holdem human rendered game ends instantly, but with random actions it works fine
+    texas_holdem_v5,
 ]

 # More difficult environments which will likely take more training time
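A minimal usage sketch for the multi-round option introduced above, assuming the renamed `texas_holdem_v5` module and the new `num_rounds`/`num_chips` keyword arguments, driven by the standard AEC agent-iteration loop:

```python
from pettingzoo.classic import texas_holdem_v5

# Chain 5 hands into one episode; every agent starts with 10 chips and is
# terminated by the MultiEpisodeEnv wrapper once its chips are exhausted.
env = texas_holdem_v5.env(num_players=3, num_rounds=5, num_chips=10)
env.reset(seed=42)

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    if termination or truncation:
        action = None
    else:
        # sample a random legal action using the provided action mask
        action = env.action_space(agent).sample(observation["action_mask"])
    env.step(action)
env.close()
```

Because the chip tracking lives entirely in `MultiEpisodeEnv`, the per-hand rewards of the underlying RLCard game are unchanged; the wrapper only terminates agents whose chips run out and stops after `num_rounds` internal resets.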