Add num_rounds and num_chips to poker envs #1109

Closed · wants to merge 21 commits
Changes from all commits
23 changes: 20 additions & 3 deletions pettingzoo/classic/rlcard_envs/texas_holdem.py
@@ -9,7 +9,7 @@

This environment is part of the <a href='..'>classic environments</a>. Please read that page first for general information.

| Import | `from pettingzoo.classic import texas_holdem_v4` |
| Import | `from pettingzoo.classic import texas_holdem_v5` |
|--------------------|--------------------------------------------------|
| Actions | Discrete |
| Parallel API | Yes |
@@ -25,10 +25,13 @@
## Arguments

``` python
texas_holdem_v4.env(num_players=2)
texas_holdem_v5.env(num_players=2)
```

`num_players`: Sets the number of players in the game. Minimum is 2.
`num_rounds`: Runs the environment for the given number of rounds (hands) before terminating. If this is set, `num_chips` must also be set.
`num_chips`: The number of starting chips each agent has. Must be used in conjunction with `num_rounds`.
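
For example, a minimal sketch (using the standard AEC interaction loop) of chaining five hands into one episode, with each of three agents starting on 10 chips:

``` python
from pettingzoo.classic import texas_holdem_v5

env = texas_holdem_v5.env(num_players=3, num_rounds=5, num_chips=10)
env.reset(seed=42)

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    if termination or truncation:
        action = None
    else:
        # sample a random legal action using the action mask
        mask = observation["action_mask"]
        action = env.action_space(agent).sample(mask)
    env.step(action)
env.close()
```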


### Observation Space

@@ -69,6 +69,7 @@

### Version History

* v5: Added multiround option (1.24.2)
* v4: Upgrade to RLCard 1.0.3 (1.11.0)
* v3: Fixed bug in arbitrary calls to observe() (1.8.0)
* v2: Bumped RLCard version, bug fixes, legal action mask in observation replaced illegal move list in infos (1.5.0)
@@ -108,17 +112,30 @@ def get_font(path, size):


def env(**kwargs):
# optionally chain several rounds into one episode, tracking each agent's chips via MultiEpisodeEnv
num_rounds = kwargs.pop("num_rounds", False)
num_chips = kwargs.pop("num_chips", False)
assert bool(num_rounds) == bool(
num_chips
), f"If `num_rounds` is used, `num_chips` must also be declared, got {num_rounds=}, {num_chips=}"

env = raw_env(**kwargs)
env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
env = wrappers.AssertOutOfBoundsWrapper(env)
env = wrappers.OrderEnforcingWrapper(env)

if num_rounds:
env = wrappers.MultiEpisodeEnv(
env, num_episodes=num_rounds, starting_utility=num_chips
)

return env


class raw_env(RLCardBase, EzPickle):
metadata = {
"render_modes": ["human", "rgb_array"],
"name": "texas_holdem_v4",
"name": "texas_holdem_v5",
"is_parallelizable": False,
"render_fps": 1,
}
20 changes: 17 additions & 3 deletions pettingzoo/classic/rlcard_envs/texas_holdem_no_limit.py
@@ -9,7 +9,7 @@

This environment is part of the <a href='..'>classic environments</a>. Please read that page first for general information.

| Import | `from pettingzoo.classic import texas_holdem_no_limit_v6` |
| Import | `from pettingzoo.classic import texas_holdem_no_limit_v7` |
|--------------------|-----------------------------------------------------------|
| Actions | Discrete |
| Parallel API | Yes |
@@ -29,7 +29,7 @@
### Arguments

``` python
texas_holdem_no_limit_v6.env(num_players=2)
texas_holdem_no_limit_v7.env(num_players=2)
```

`num_players`: Sets the number of players in the game. Minimum is 2.
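
Analogously, a short sketch of a multi-round no-limit game (assuming `env()` accepts the same `num_rounds`/`num_chips` keywords, as handled in the factory below):

``` python
from pettingzoo.classic import texas_holdem_no_limit_v7

# Four players, three chained hands, 100 starting chips each.
env = texas_holdem_no_limit_v7.env(num_players=4, num_rounds=3, num_chips=100)
env.reset(seed=42)
```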
@@ -79,6 +79,7 @@

### Version History

* v7: Added multiround option (1.24.2)
* v6: Upgrade to RLCard 1.0.5, fixes to the action space as ACPC (1.12.0)
* v5: Upgrade to RLCard 1.0.4, fixes to rewards with greater than 2 players (1.11.1)
* v4: Upgrade to RLCard 1.0.3 (1.11.0)
@@ -121,17 +122,30 @@ def get_font(path, size):


def env(**kwargs):
# optionally chain several rounds into one episode, tracking each agent's chips via MultiEpisodeEnv
num_rounds = kwargs.pop("num_rounds", False)
num_chips = kwargs.pop("num_chips", False)
assert bool(num_rounds) == bool(
num_chips
), f"If `num_rounds` is used, `num_chips` must also be declared, got {num_rounds=}, {num_chips=}"

env = raw_env(**kwargs)
env = wrappers.TerminateIllegalWrapper(env, illegal_reward=-1)
env = wrappers.AssertOutOfBoundsWrapper(env)
env = wrappers.OrderEnforcingWrapper(env)

if num_rounds:
env = wrappers.MultiEpisodeEnv(
env, num_episodes=num_rounds, starting_utility=num_chips
)

return env


class raw_env(RLCardBase, EzPickle):
metadata = {
"render_modes": ["human", "rgb_array"],
"name": "texas_holdem_no_limit_v6",
"name": "texas_holdem_no_limit_v7",
"is_parallelizable": False,
"render_fps": 1,
}
8 changes: 4 additions & 4 deletions pettingzoo/test/api_test.py
@@ -68,8 +68,8 @@ def action_mask():
list and observation_spaces, action_spaces dictionaries."""
env_obs_dicts = [
"leduc_holdem_v4",
"texas_holdem_no_limit_v6",
"texas_holdem_v4",
"texas_holdem_no_limit_v7",
"texas_holdem_v5",
"go_v5",
"chess_v6",
"connect_four_v3",
@@ -89,8 +89,8 @@ def action_mask():
env_all_zeros_obs = ["knights_archers_zombies_v10"]
env_obs_space = [
"leduc_holdem_v4",
"texas_holdem_no_limit_v6",
"texas_holdem_v4",
"texas_holdem_no_limit_v7",
"texas_holdem_v5",
"go_v5",
"hanabi_v5",
"knights_archers_zombies_v10",
8 changes: 4 additions & 4 deletions pettingzoo/utils/all_modules.py
@@ -37,8 +37,8 @@
hanabi_v5,
leduc_holdem_v4,
rps_v2,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)
from pettingzoo.mpe import (
@@ -93,8 +93,8 @@
"classic/connect_four_v3": connect_four_v3,
"classic/tictactoe_v3": tictactoe_v3,
"classic/leduc_holdem_v4": leduc_holdem_v4,
"classic/texas_holdem_v4": texas_holdem_v4,
"classic/texas_holdem_no_limit_v6": texas_holdem_no_limit_v6,
"classic/texas_holdem_v5": texas_holdem_v5,
"classic/texas_holdem_no_limit_v7": texas_holdem_no_limit_v7,
"classic/gin_rummy_v4": gin_rummy_v4,
"classic/go_v5": go_v5,
"classic/hanabi_v5": hanabi_v5,
27 changes: 25 additions & 2 deletions pettingzoo/utils/wrappers/multi_episode_env.py
@@ -13,21 +13,30 @@ class MultiEpisodeEnv(BaseWrapper):
When there are no more valid agents in the underlying environment, the environment is automatically reset.
After `num_episodes` have been run internally, the environment terminates normally.
The result of this wrapper is that the environment is no longer Markovian around the environment reset.

When `starting_utility` is used, all agents start with a base amount of health points (think of this as poker chips).
Review comment (Contributor): This is an interesting way to do it, but I'm not sure this is the best name for it. I can't really think of anything else besides `starting_reward` or something. Maybe it could be something like `total_rewards`, to indicate that it makes the rewards track between resets, or `tally_rewards`. `starting_utility` doesn't mean anything to me if I don't know how it works; it would make sense and be simple if it was `total_rewards` that adds or subtracts each round.

Reply (Member, author): I think utility makes more sense; it is a pretty standard way of saying "starting amount of meaningful substance". `starting_reward` could be confused with a reward given to the agent for starting a new episode, while `tally_rewards` sounds like it should be a boolean.

Whenever an agent receives a negative reward, it is subtracted from its utility; positive rewards are added to it.
Agents that run out of utility are terminated.
"""

def __init__(self, env: AECEnv, num_episodes: int):
def __init__(
self, env: AECEnv, num_episodes: int, starting_utility: float | None = None
):
"""__init__.

Args:
env (AECEnv): env
num_episodes (int): num_episodes
starting_utility (Optional[float]): utility (e.g. chips) each agent starts with; agents whose utility drops to zero or below are terminated
"""
assert isinstance(
env, AECEnv
), "MultiEpisodeEnv is only compatible with AEC environments"
super().__init__(env)

self._num_episodes = num_episodes
self._starting_utility = starting_utility

def reset(self, seed: int | None = None, options: dict | None = None) -> None:
"""reset.
@@ -39,10 +48,12 @@ def reset(self, seed: int | None = None, options: dict | None = None) -> None:
Returns:
None:
"""
super().reset(seed=seed, options=options)
self._episodes_elapsed = 1
self._seed = copy.deepcopy(seed)
self._options = copy.deepcopy(options)
super().reset(seed=seed, options=options)
if self._starting_utility:
self._agent_utilities = {a: self._starting_utility for a in self.agents}

def step(self, action: ActionType) -> None:
"""Steps the underlying environment for `num_episodes`.
@@ -59,6 +70,18 @@ def step(self, action: ActionType) -> None:
None:
"""
super().step(action)

# adjust utilities if this param is enabled
if self._starting_utility:
for agent in self.agents:
self._agent_utilities[agent] = (
self._agent_utilities[agent] + self.rewards[agent]
)

if self._agent_utilities[agent] <= 0:
self.terminations[agent] = True

# if we still have agents, don't need to do anything
if self.agents:
return

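
For reference, a minimal sketch of using the extended AEC wrapper directly, assuming the `starting_utility` signature added above (roughly what the poker `env()` factories do when `num_rounds`/`num_chips` are passed):

``` python
from pettingzoo.classic import texas_holdem_no_limit_v7
from pettingzoo.utils.wrappers import MultiEpisodeEnv

# Five chained hands; each agent starts with 20 units of utility (chips)
# and is terminated once its running utility drops to zero or below.
env = texas_holdem_no_limit_v7.env(num_players=3)
env = MultiEpisodeEnv(env, num_episodes=5, starting_utility=20)
env.reset(seed=42)
```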
25 changes: 23 additions & 2 deletions pettingzoo/utils/wrappers/multi_episode_parallel_env.py
Original file line number Diff line number Diff line change
@@ -13,21 +13,30 @@ class MultiEpisodeParallelEnv(BaseParallelWrapper):
When there are no more valid agents in the underlying environment, the environment is automatically reset.
When this happens, the `observation` and `info` returned by `step()` are replaced with that of the reset environment.
The result of this wrapper is that the environment is no longer Markovian around the environment reset.

When `starting_utility` is used, all agents start with a base amount of health points (think of this as poker chips).
Whenever an agent receives a negative reward, it is subtracted from its utility; positive rewards are added to it.
Agents that run out of utility are terminated.
"""

def __init__(self, env: ParallelEnv, num_episodes: int):
def __init__(
self, env: ParallelEnv, num_episodes: int, starting_utility: float | None = None
):
"""__init__.

Args:
env (AECEnv): the base environment
num_episodes (int): the number of episodes to run the underlying environment
starting_utility (Optional[float]): utility (e.g. chips) each agent starts with; agents whose utility drops to zero or below are terminated
"""
super().__init__(env)
assert isinstance(
env, ParallelEnv
), "MultiEpisodeEnv is only compatible with ParallelEnv environments."

self._num_episodes = num_episodes
self._starting_utility = starting_utility

def reset(
self, seed: int | None = None, options: dict | None = None
@@ -46,6 +55,8 @@ def reset(
self._seed = copy.deepcopy(seed)
self._options = copy.deepcopy(options)
self._episodes_elapsed = 1
if self._starting_utility:
self._agent_utilities = {a: self._starting_utility for a in self.agents}

return obs, info

@@ -77,7 +88,17 @@ def step(
]:
"""
obs, rew, term, trunc, info = super().step(actions)
term = {agent: False for agent in term}

# handle agent utilities if any
if self._starting_utility:
self._agent_utilities = {
agent: self._agent_utilities[agent] + rew[agent] for agent in self.agents
}
# termination now depends only on each agent's remaining utility
term = {agent: self._agent_utilities[agent] <= 0 for agent in term}
else:
term = {agent: False for agent in term}

trunc = {agent: False for agent in term}

if self.agents:
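
And a corresponding sketch for the parallel wrapper, assuming the same `starting_utility` extension (`pistonball_v6` is used here purely as an example `ParallelEnv`):

``` python
from pettingzoo.butterfly import pistonball_v6
from pettingzoo.utils.wrappers import MultiEpisodeParallelEnv

# Three back-to-back episodes; each agent starts with 100 utility and is
# terminated once its cumulative rewards drive that utility to zero or below.
env = pistonball_v6.parallel_env()
env = MultiEpisodeParallelEnv(env, num_episodes=3, starting_utility=100)
observations, infos = env.reset(seed=42)

while env.agents:
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()
```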
24 changes: 17 additions & 7 deletions test/all_parameter_combs_test.py
@@ -41,8 +41,8 @@
hanabi_v5,
leduc_holdem_v4,
rps_v2,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)
from pettingzoo.mpe import (
@@ -156,11 +156,21 @@
),
],
["classic/leduc_holdem_v4", leduc_holdem_v4, dict()],
["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=3)],
["classic/texas_holdem_v4", texas_holdem_v4, dict(num_players=4)],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict()],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=3)],
["classic/texas_holdem_no_limit_v6", texas_holdem_no_limit_v6, dict(num_players=4)],
["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=3)],
["classic/texas_holdem_v5", texas_holdem_v5, dict(num_players=4)],
[
"classic/texas_holdem_v5",
texas_holdem_v5,
dict(num_players=4, num_rounds=3, num_chips=5),
],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict()],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=3)],
["classic/texas_holdem_no_limit_v7", texas_holdem_no_limit_v7, dict(num_players=4)],
[
"classic/texas_holdem_no_limit_v7",
texas_holdem_no_limit_v7,
dict(num_players=4, num_rounds=3, num_chips=5),
],
[
"butterfly/knights_archers_zombies_v10",
knights_archers_zombies_v10,
4 changes: 2 additions & 2 deletions test/wrapper_test.py
@@ -3,7 +3,7 @@
import pytest

from pettingzoo.butterfly import pistonball_v6
from pettingzoo.classic import texas_holdem_no_limit_v6
from pettingzoo.classic import texas_holdem_no_limit_v7
from pettingzoo.utils.wrappers import MultiEpisodeEnv, MultiEpisodeParallelEnv


@@ -16,7 +16,7 @@ def test_multi_episode_env_wrapper(num_episodes: int) -> None:
Args:
num_episodes: number of episodes to run the MultiEpisodeEnv
"""
env = texas_holdem_no_limit_v6.env(num_players=3)
env = texas_holdem_no_limit_v7.env(num_players=3)
env = MultiEpisodeEnv(env, num_episodes=num_episodes)
env.reset(seed=42)

4 changes: 2 additions & 2 deletions tutorials/LangChain/langchain_example.py
@@ -48,9 +48,9 @@ def tic_tac_toe():


def texas_holdem_no_limit():
from pettingzoo.classic import texas_holdem_no_limit_v6
from pettingzoo.classic import texas_holdem_no_limit_v7

env = texas_holdem_no_limit_v6.env(num_players=4, render_mode="human")
env = texas_holdem_no_limit_v7.env(num_players=4, render_mode="human")
agents = {
name: ActionMaskAgent(name=name, model=ChatOpenAI(temperature=0.2), env=env)
for name in env.possible_agents
8 changes: 4 additions & 4 deletions tutorials/SB3/test/test_sb3_action_mask.py
@@ -8,8 +8,8 @@
go_v5,
hanabi_v5,
leduc_holdem_v4,
texas_holdem_no_limit_v6,
texas_holdem_v4,
texas_holdem_no_limit_v7,
texas_holdem_v5,
tictactoe_v3,
)

@@ -22,8 +22,8 @@
# These environments do better than random even after the minimum number of timesteps
EASY_ENVS = [
gin_rummy_v4,
texas_holdem_no_limit_v6, # texas holdem human rendered game ends instantly, but with random actions it works fine
texas_holdem_v4,
texas_holdem_no_limit_v7, # texas holdem human rendered game ends instantly, but with random actions it works fine
texas_holdem_v5,
]

# More difficult environments which will likely take more training time