Add num_rounds and num_chips to poker envs #1109

Closed · wants to merge 21 commits
Changes from all commits
23 changes: 20 additions & 3 deletions pettingzoo/classic/rlcard_envs/texas_holdem.py
@@ -9,7 +9,7 @@

This environment is part of the <a href='..'>classic environments</a>. Please read that page first for general information.

| Import | `from pettingzoo.classic import texas_holdem_v4` |
| Import | `from pettingzoo.classic import texas_holdem_v5` |
|--------------------|--------------------------------------------------|
| Actions | Discrete |
| Parallel API | Yes |
@@ -25,10 +25,13 @@
## Arguments

``` python
texas_holdem_v4.env(num_players=2)
texas_holdem_v5.env(num_players=2)
```

`num_players`: Sets the number of players in the game. Minimum is 2.
`num_rounds`: Runs the environment for the given number of rounds (hands) before terminating. If this is set, `num_chips` must also be set.
`num_chips`: The number of starting chips each agent has. Must be used in conjunction with `num_rounds`.
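
For example, a minimal sketch (using the standard AEC interaction loop) of chaining five hands into one episode, with each of three agents starting on 10 chips:

``` python
from pettingzoo.classic import texas_holdem_v5

env = texas_holdem_v5.env(num_players=3, num_rounds=5, num_chips=10)
env.reset(seed=42)

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    if termination or truncation:
        action = None
    else:
        # sample a random legal action using the action mask
        mask = observation["action_mask"]
        action = env.action_space(agent).sample(mask)
    env.step(action)
env.close()
```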


### Observation Space

@@ -69,6 +69,7 @@

### Version History

* v5: Added multiround option (1.24.2)
* v4: Upgrade to RLCard 1.0.3 (1.11.0)
* v3: Fixed bug in arbitrary calls to observe() (1.8.0)
* v2: Bumped RLCard version, bug fixes, legal action mask in observation replaced illegal move list in infos (1.5.0)
@@ -108,17 +112,30 @@ def get_font(path, size):


def env(**kwargs):
# optionally chain several rounds into one episode, tracking each agent's chips via MultiEpisodeEnv
num_rounds = kwargs.pop("num_rounds", False)
num_chips = kwargs.pop("num_chips", False)
assert bool(num_rounds) == bool(
num_chips
), f"If `num_rounds` is used, `num_chips` must also be declared, got {num_rounds=}, {num_chips=}"

env = raw_env(**kwargs)
env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
env = wrappers.AssertOutOfBoundsWrapper(env)
env = wrappers.OrderEnforcingWrapper(env)

if num_rounds:
env = wrappers.MultiEpisodeEnv(
env, num_episodes=num_rounds, starting_utility=num_chips
)

return env


class raw_env(RLCardBase, EzPickle):
metadata = {
"render_modes": ["human", "rgb_array"],
"name": "texas_holdem_v4",
"name": "texas_holdem_v5",
"is_parallelizable": False,
"render_fps": 1,
}
20 changes: 17 additions & 3 deletions pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py
@@ -9,7 +9,7 @@

This environment is part of the <a href='..'>classic environments</a>. Please read that page first for general information.

| Import | `from pettingzoo.classic import texas_holdem_no_limit_v6` |
| Import | `from pettingzoo.classic import texas_holdem_no_limit_v7` |
|--------------------|-----------------------------------------------------------|
| Actions | Discrete |
| Parallel API | Yes |
@@ -29,7 +29,7 @@
### Arguments

``` python
texas_holdem_no_limit_v6.env(num_players=2)
texas_holdem_no_limit_v7.env(num_players=2)
```

`num_players`: Sets the number of players in the game. Minimum is 2.
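
Analogously, a short sketch of a multi-round no-limit game (assuming `env()` accepts the same `num_rounds`/`num_chips` keywords, as handled in the factory below):

``` python
from pettingzoo.classic import texas_holdem_no_limit_v7

# Four players, three chained hands, 100 starting chips each.
env = texas_holdem_no_limit_v7.env(num_players=4, num_rounds=3, num_chips=100)
env.reset(seed=42)
```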
@@ -79,6 +79,7 @@

### Version History

* v7: Added multiround option (1.24.2)
* v6: Upgrade to RLCard 1.0.5, fixes to the action space as ACPC (1.12.0)
* v5: Upgrade to RLCard 1.0.4, fixes to rewards with greater than 2 players (1.11.1)
* v4: Upgrade to RLCard 1.0.3 (1.11.0)
@@ -121,17 +122,30 @@ def get_font(path, size):


def env(**kwargs):
# optionally chain several rounds into one episode, tracking each agent's chips via MultiEpisodeEnv
num_rounds = kwargs.pop("num_rounds", False)
num_chips = kwargs.pop("num_chips", False)
assert bool(num_rounds) == bool(
num_chips
), f"If `num_rounds` is used, `num_chips` must also be declared, got {num_rounds=}, {num_chips=}"

env = raw_env(**kwargs)
env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
env = wrappers.AssertOutOfBoundsWrapper(env)
env = wrappers.OrderEnforcingWrapper(env)

if num_rounds:
env = wrappers.MultiEpisodeEnv(
env, num_episodes=num_rounds, starting_utility=num_chips
)

return env


class raw_env(RLCardBase, EzPickle):
metadata = {
"render_modes": ["human", "rgb_array"],
"name": "texas_holdem_no_limit_v6",
"name": "texas_holdem_no_limit_v7",
"is_parallelizable": False,
"render_fps": 1,
}
8 changes: 4 additions & 4 deletions pettingzoo/test/api_test.py
@@ -68,8 +68,8 @@ def action_mask():
list and observation_spaces, action_spaces dictionaries."""
env_obs_dicts = [
"leduc_holdem_v4",
"texas_holdem_no_limit_v6",
"texas_holdem_v4",
"texas_holdem_no_limit_v7",
"texas_holdem_v5",
"go_v5",
"chess_v6",
"connect_four_v3",
@@ -89,8 +89,8 @@ def action_mask():
env_all_zeros_obs = ["knights_archers_zombies_v10"]
env_obs_space = [
"leduc_holdem_v4",
"texas_holdem_no_limit_v6",
"texas_holdem_v4",
"texas_holdem_no_limit_v7",
"texas_holdem_v5",
"go_v5",
"hanabi_v5",
"knights_archers_zombies_v10",
8 changes: 4 additions & 4 deletions pettingzoo/utils/all_modules.py
@@ -37,8 +37,8 @@
hanabi_v5,
leduc_holdem_v4,
rps_v2,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)
from pettingzoo.mpe import (
@@ -93,8 +93,8 @@
"classic/connect_four_v3": connect_four_v3,
"classic/tictactoe_v3": tictactoe_v3,
"classic/leduc_holdem_v4": leduc_holdem_v4,
"classic/texas_holdem_v4": texas_holdem_v4,
"classic/texas_holdem_no_limit_v6": texas_holdem_no_limit_v6,
"classic/texas_holdem_v5": texas_holdem_v5,
"classic/texas_holdem_no_limit_v7": texas_holdem_no_limit_v7,
"classic/gin_rummy_v4": gin_rummy_v4,
"classic/go_v5": go_v5,
"classic/hanabi_v5": hanabi_v5,
27 changes: 25 additions & 2 deletions pettingzoo/utils/wrappers/multi_episode_env.py
@@ -13,21 +13,30 @@ class MultiEpisodeEnv(BaseWrapper):
When there are no more valid agents in the underlying environment, the environment is automatically reset.
After `num_episodes` have been run internally, the environment terminates normally.
The result of this wrapper is that the environment is no longer Markovian around the environment reset.

When `starting_utility` is used, all agents start with a base amount of health points (think of this as poker chips).
Review comment (Contributor): This is an interesting way to do it, but I'm not sure this is the best name for it. I can't really think of anything else besides `starting_reward` or something. Maybe it could be something like `total_rewards`, to indicate that it makes the rewards track between resets, or `tally_rewards`. `starting_utility` doesn't mean anything to me if I don't know how it works; it would make sense and be simple if it was `total_rewards` that adds or subtracts each round.

Reply (Member, author): I think utility makes more sense; it is a pretty standard way of saying "starting amount of meaningful substance". `starting_reward` could be confused with a reward given to the agent for starting a new episode, while `tally_rewards` sounds like it should be a boolean.

Whenever an agent receives a negative reward, it is subtracted from its utility; positive rewards are added to it.
Agents that run out of utility are terminated.
"""

def __init__(self, env: AECEnv, num_episodes: int):
def __init__(
self, env: AECEnv, num_episodes: int, starting_utility: float | None = None
):
"""__init__.

Args:
env (AECEnv): env
num_episodes (int): num_episodes
starting_utility (Optional[float]): utility (e.g. chips) each agent starts with; agents whose utility drops to zero or below are terminated
"""
assert isinstance(
env, AECEnv
), "MultiEpisodeEnv is only compatible with AEC environments"
super().__init__(env)

self._num_episodes = num_episodes
self._starting_utility = starting_utility

def reset(self, seed: int | None = None, options: dict | None = None) -> None:
"""reset.
@@ -39,10 +48,12 @@ def reset(self, seed: int | None = None, options: dict | None = None) -> None:
Returns:
None:
"""
super().reset(seed=seed, options=options)
self._episodes_elapsed = 1
self._seed = copy.deepcopy(seed)
self._options = copy.deepcopy(options)
super().reset(seed=seed, options=options)
if self._starting_utility:
self._agent_utilities = {a: self._starting_utility for a in self.agents}

def step(self, action: ActionType) -> None:
"""Steps the underlying environment for `num_episodes`.
@@ -59,6 +70,18 @@ def step(self, action: ActionType) -> None:
None:
"""
super().step(action)

# adjust utilities if this param is enabled
if self._starting_utility:
for agent in self.agents:
self._agent_utilities[agent] = (
self._agent_utilities[agent] + self.rewards[agent]
)

if self._agent_utilities[agent] <= 0:
self.terminations[agent] = True

# if we still have agents, don't need to do anything
if self.agents:
return

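
For reference, a minimal sketch of using the extended AEC wrapper directly, assuming the `starting_utility` signature added above (roughly what the poker `env()` factories do when `num_rounds`/`num_chips` are passed):

``` python
from pettingzoo.classic import texas_holdem_no_limit_v7
from pettingzoo.utils.wrappers import MultiEpisodeEnv

# Five chained hands; each agent starts with 20 units of utility (chips)
# and is terminated once its running utility drops to zero or below.
env = texas_holdem_no_limit_v7.env(num_players=3)
env = MultiEpisodeEnv(env, num_episodes=5, starting_utility=20)
env.reset(seed=42)
```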
25 changes: 23 additions & 2 deletions pettingzoo/utils/wrappers/multi_episode_parallel_env.py
Original file line number Diff line number Diff line change
@@ -13,21 +13,30 @@ class MultiEpisodeParallelEnv(BaseParallelWrapper):
When there are no more valid agents in the underlying environment, the environment is automatically reset.
When this happens, the `observation` and `info` returned by `step()` are replaced with that of the reset environment.
The result of this wrapper is that the environment is no longer Markovian around the environment reset.

When `starting_utility` is used, all agents start with a base amount of health points (think of this as poker chips).
Whenever an agent receives a negative reward, it is subtracted from its utility; positive rewards are added to it.
Agents that run out of utility are terminated.
"""

def __init__(self, env: ParallelEnv, num_episodes: int):
def __init__(
self, env: ParallelEnv, num_episodes: int, starting_utility: float | None = None
):
"""__init__.

Args:
env (AECEnv): the base environment
num_episodes (int): the number of episodes to run the underlying environment
starting_utility (Optional[float]): utility (e.g. chips) each agent starts with; agents whose utility drops to zero or below are terminated
"""
super().__init__(env)
assert isinstance(
env, ParallelEnv
), "MultiEpisodeEnv is only compatible with ParallelEnv environments."

self._num_episodes = num_episodes
self._starting_utility = starting_utility

def reset(
self, seed: int | None = None, options: dict | None = None
@@ -46,6 +55,8 @@ def reset(
self._seed = copy.deepcopy(seed)
self._options = copy.deepcopy(options)
self._episodes_elapsed = 1
if self._starting_utility:
self._agent_utilities = {a: self._starting_utility for a in self.agents}

return obs, info

@@ -77,7 +88,17 @@ def step(
]:
"""
obs, rew, term, trunc, info = super().step(actions)
term = {agent: False for agent in term}

# handle agent utilities if any
if self._starting_utility:
self._agent_utilities = {
agent: self._agent_utilities[agent] + rew[agent] for agent in self.agents
}
# termination now depends only on each agent's remaining utility
term = {agent: self._agent_utilities[agent] <= 0 for agent in term}
else:
term = {agent: False for agent in term}

trunc = {agent: False for agent in term}

if self.agents:
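
And a corresponding sketch for the parallel wrapper, assuming the same `starting_utility` extension (`pistonball_v6` is used here purely as an example `ParallelEnv`):

``` python
from pettingzoo.butterfly import pistonball_v6
from pettingzoo.utils.wrappers import MultiEpisodeParallelEnv

# Three back-to-back episodes; each agent starts with 100 utility and is
# terminated once its cumulative rewards drive that utility to zero or below.
env = pistonball_v6.parallel_env()
env = MultiEpisodeParallelEnv(env, num_episodes=3, starting_utility=100)
observations, infos = env.reset(seed=42)

while env.agents:
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()
```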
24 changes: 17 additions & 7 deletions test/all_parameter_combs_test.py
@@ -41,8 +41,8 @@
hanabi_v5,
leduc_holdem_v4,
rps_v2,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)
from pettingzoo.mpe import (
@@ -156,11 +156,21 @@
),
],
["classic/leduc_holdem_v4", leduc_holdem_v4, dict()],
["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=3)],
["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=4)],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict()],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=3)],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=4)],
["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=3)],
["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=4)],
[
"classic/texas_holdem_v5",
texas_holdem_v5,
dict(num_players=4, num_rounds=3, num_chips=5),
],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict()],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=3)],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=4)],
[
"classic/texas_holdem_no_limit_v7",
texas_holdem_no_limit_v7,
dict(num_players=4, num_rounds=3, num_chips=5),
],
[
"butterfly/knights_archers_zombies_v10",
knights_archers_zombies_v10,
4 changes: 2 additions & 2 deletions test/wrapper_test.py
@@ -3,7 +3,7 @@
import pytest

from pettingzoo.butterfly import pistonball_v6
from pettingzoo.classic import texas_holdem_no_limit_v6
from pettingzoo.classic import texas_holdem_no_limit_v7
from pettingzoo.utils.wrappers import MultiEpisodeEnv, MultiEpisodeParallelEnv


@@ -16,7 +16,7 @@ def test_multi_episode_env_wrapper(num_episodes: int) -> None:
Args:
num_episodes: number of episodes to run the MultiEpisodeEnv
"""
env = texas_holdem_no_limit_v6.env(num_players=3)
env = texas_holdem_no_limit_v7.env(num_players=3)
env = MultiEpisodeEnv(env, num_episodes=num_episodes)
env.reset(seed=42)

4 changes: 2 additions & 2 deletions tutorials/LangChain/langchain_example.py
@@ -48,9 +48,9 @@ def tic_tac_toe():


def texas_holdem_no_limit():
from pettingzoo.classic import texas_holdem_no_limit_v6
from pettingzoo.classic import texas_holdem_no_limit_v7

env = texas_holdem_no_limit_v6.env(num_players=4, render_mode="human")
env = texas_holdem_no_limit_v7.env(num_players=4, render_mode="human")
agents = {
name: ActionMaskAgent(name=name, model=ChatOpenAI(temperature=0.2), env=env)
for name in env.possible_agents
8 changes: 4 additions & 4 deletions tutorials/SB3/test/test_sb3_action_mask.py
@@ -8,8 +8,8 @@
go_v5,
hanabi_v5,
leduc_holdem_v4,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)

@@ -22,8 +22,8 @@
# These environments do better than random even after the minimum number of timesteps
EASY_ENVS = [
gin_rummy_v4,
texas_holdem_no_limit_v6, # texas holdem human rendered game ends instantly, but with random actions it works fine
texas_holdem_v4,
texas_holdem_no_limit_v7, # texas holdem human rendered game ends instantly, but with random actions it works fine
texas_holdem_v5,
]

# More difficult environments which will likely take more training time