
env(lisong): add beergame supply chain optimization env #512

Merged · 14 commits · Nov 7, 2022
README.md (1 addition, 0 deletions)
@@ -273,6 +273,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo`
| 29 |[dmc2gym](https://github.com/denisyarats/dmc2gym) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/dmc2gym/dmc2gym_cheetah.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/dmc2gym)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/dmc2gym.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/dmc2gym_zh.html) |
| 30 |[evogym](https://github.com/EvolutionGym/evogym) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/evogym/evogym.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/evogym/envs)<br>环境指南 |
| 31 |[gym-pybullet-drones](https://github.com/utiasDSL/gym-pybullet-drones) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/gym-pybullet-drones/gym-pybullet-drones.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gym_pybullet_drones/envs)<br>环境指南 |
| 32 |[beergame](https://github.com/OptMLGroup/DeepBeerInventory-RL) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/beergame/beergame.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/beergame/envs)<br>环境指南 |

![discrete](https://img.shields.io/badge/-discrete-brightgreen) means discrete action space

ding/envs/env_manager/base_env_manager.py (19 additions, 1 deletion)
@@ -203,7 +203,9 @@ def done(self) -> bool:

@property
def method_name_list(self) -> list:
return ['reset', 'step', 'seed', 'close', 'enable_save_replay', 'render']
return [
'reset', 'step', 'seed', 'close', 'enable_save_replay', 'render', 'reward_shaping', 'enable_save_figure'
]

def env_state_done(self, env_id: int) -> bool:
return self._env_states[env_id] == EnvState.DONE
@@ -418,6 +420,19 @@ def enable_save_replay(self, replay_path: Union[List[str], str]) -> None:
replay_path = [replay_path] * self.env_num
self._env_replay_path = replay_path

def enable_save_figure(self, env_id: int, figure_path: Union[List[str], str]) -> None:
"""
Overview:
Set the figure save path for the specified env.
Arguments:
- env_id (:obj:`int`): Id of the env whose figure save path is set.
- figure_path (:obj:`str`): Path under which this env saves its figures.
"""
if isinstance(figure_path, str):
self._env[env_id].enable_save_figure(figure_path)
else:
raise TypeError("invalid figure_path arguments type: {}".format(type(figure_path)))

def close(self) -> None:
"""
Overview:
@@ -431,6 +446,9 @@ def close(self) -> None:
self._env_states[i] = EnvState.VOID
self._closed = True

def reward_shaping(self, env_id: int, transitions: List[dict]) -> List[dict]:
return self._envs[env_id].reward_shaping(transitions)

@property
def closed(self) -> bool:
return self._closed
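For context, a minimal sketch of the env-side interface these two new manager hooks assume. The method names `enable_save_figure` and `reward_shaping` come from the diff above; the class name and the body logic (creating the directory, spreading the episode total over each step) are illustrative assumptions, not part of the PR.

from typing import List
import os


class MyBeerGameLikeEnv:

    def enable_save_figure(self, figure_path: str) -> None:
        # Remember where per-episode figures should go; the env decides when to draw them.
        os.makedirs(figure_path, exist_ok=True)
        self._figure_path = figure_path

    def reward_shaping(self, transitions: List[dict]) -> List[dict]:
        # Illustrative shaping only: replace each step reward with the episode average,
        # in the spirit of the "use total reward to reshape reward" comment in the config below.
        total = sum(t['reward'] for t in transitions)
        for t in transitions:
            t['reward'] = total / max(len(transitions), 1)
        return transitions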
ding/worker/collector/episode_serial_collector.py (5 additions, 1 deletion)
@@ -21,7 +21,9 @@ class EpisodeSerialCollector(ISerialCollector):
envstep
"""

config = dict(deepcopy_obs=False, transform_obs=False, collect_print_freq=100, get_train_sample=False)
config = dict(
deepcopy_obs=False, transform_obs=False, collect_print_freq=100, get_train_sample=False, reward_shaping=False
)

def __init__(
self,
@@ -251,6 +253,8 @@ def collect(self,
# prepare data
if timestep.done:
transitions = to_tensor_transitions(self._traj_buffer[env_id])
if self._cfg.reward_shaping:
self._env.reward_shaping(env_id, transitions)
if self._cfg.get_train_sample:
train_sample = self._policy.get_train_sample(transitions)
return_data.extend(train_sample)
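The new `reward_shaping` flag is switched on per collector config; a hedged sketch of the relevant fragment (the beergame config later in this PR sets exactly these two keys, all other keys omitted here):

collect=dict(
    collector=dict(
        get_train_sample=True,
        reward_shaping=True,  # the collector then calls env.reward_shaping(...) on each finished episode
    ),
),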
ding/worker/collector/interaction_serial_evaluator.py (3 additions, 0 deletions)
@@ -241,6 +241,9 @@ def eval(
continue
if t.done:
# Env reset is done by env_manager automatically.
if 'figure_path' in self._cfg:
if self._cfg.figure_path is not None:
self._env.enable_save_figure(env_id, self._cfg.figure_path)
self._policy.reset([env_id])
reward = t.info['final_eval_reward']
if 'episode_info' in t.info:
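The evaluator only triggers figure saving when `figure_path` is present in its config and not None; a one-line sketch of how an entry script can set it (the `beergame_eval.py` file later in this PR does the same, the directory name here is arbitrary):

cfg.policy.eval.evaluator.figure_path = './figures/'  # set to None (or omit the key) to disable figure saving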
Binary file added dizoo/beergame/beergame.png
dizoo/beergame/config/beergame_onppo_config.py (70 additions, 0 deletions)
@@ -0,0 +1,70 @@
from easydict import EasyDict

beergame_ppo_config = dict(
exp_name='beergame_ppo_seed0',
env=dict(
collector_env_num=8,
evaluator_env_num=8,
n_evaluator_episode=8,
stop_value=200,
role=0, # 0-3 : retailer, warehouse, distributor, manufacturer
agent_type='bs',
# type of co-players: 'bs' - base-stock policy, 'Strm' - Sterman formula that models typical human ordering behavior
demandDistribution=0
# demand distribution: 0 = uniform (default), 1 = normal, 2 = the fixed sequence 4,4,4,4,8,..., 3 = basket data, 4 = forecast data
),
policy=dict(
cuda=True,
recompute_adv=True,
action_space='discrete',
model=dict(
obs_shape=50, # stateDim * multPerdInpt = 5 * 10
action_shape=5, # the quantity relative to the arriving order
action_space='discrete',
encoder_hidden_size_list=[64, 64, 128],
actor_head_hidden_size=128,
critic_head_hidden_size=128,
),
learn=dict(
epoch_per_collect=10,
batch_size=320,
learning_rate=3e-4,
entropy_weight=0.001,
adv_norm=True,
value_norm=True,
# For on-policy PPO, recomputing the advantage needs the key `done` in data to split trajectories,
# which normally requires ignore_done=False.
# Since the key `traj_flag` is added to data as a backup for `done`, ignore_done=True can be used here.
ignore_done=True,
),
collect=dict(
n_episode=8,
discount_factor=0.99,
gae_lambda=0.95,
collector=dict(
get_train_sample=True,
reward_shaping=True, # whether to use the total reward to reshape the reward
),
),
eval=dict(evaluator=dict(eval_freq=500, )),
),
)
beergame_ppo_config = EasyDict(beergame_ppo_config)
main_config = beergame_ppo_config
beergame_ppo_create_config = dict(
env=dict(
type='beergame',
import_names=['dizoo.beergame.envs.beergame_env'],
),
env_manager=dict(type='base'),
policy=dict(type='ppo'),
collector=dict(type='episode', ),
)
beergame_ppo_create_config = EasyDict(beergame_ppo_create_config)
create_config = beergame_ppo_create_config

if __name__ == "__main__":
# or you can enter `ding -m serial_onpolicy -c beergame_onppo_config.py -s 0`
from ding.entry import serial_pipeline_onpolicy
serial_pipeline_onpolicy([main_config, create_config], seed=0)
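A note on the shapes in the model config above, inferred from the inline comments and the BGAgent state code later in this PR; the concrete defaults are assumptions about the standard beergame settings:

# obs_shape = stateDim * multPerdInpt = 5 * 10:
# 5 features per period (backorder, on-hand inventory, open orders, arrived shipment, arrived order),
# stacked over the last 10 periods.
state_dim, mult_perd_inpt = 5, 10
assert state_dim * mult_perd_inpt == 50
# action_shape = 5 is assumed to correspond to the "d + x" ordering rule with x in {-2, -1, 0, +1, +2}
# relative to the arriving order.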
dizoo/beergame/entry/beergame_eval.py (42 additions, 0 deletions)
@@ -0,0 +1,42 @@
import os
import torch
from tensorboardX import SummaryWriter

from ding.config import compile_config
from ding.worker import InteractionSerialEvaluator
from ding.envs import BaseEnvManager
from ding.policy import PPOPolicy
from ding.model import VAC
from ding.utils import set_pkg_seed
from dizoo.beergame.config.beergame_onppo_config import beergame_ppo_config, beergame_ppo_create_config
from ding.envs import get_vec_env_setting
from functools import partial


def main(cfg, seed=0):
env_fn = None
cfg, create_cfg = beergame_ppo_config, beergame_ppo_create_config
cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True)
collector_env_num, evaluator_env_num = cfg.env.collector_env_num, cfg.env.evaluator_env_num

env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env)
cfg.env.manager.auto_reset = False
evaluator_env = BaseEnvManager(env_fn=[partial(env_fn, cfg=c) for c in evaluator_env_cfg], cfg=cfg.env.manager)
evaluator_env.seed(seed, dynamic_seed=False)
set_pkg_seed(seed, use_cuda=cfg.policy.cuda)
model = VAC(**cfg.policy.model)
tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial'))
policy = PPOPolicy(cfg.policy, model=model)
# set the path to save figure
cfg.policy.eval.evaluator.figure_path = './'
evaluator = InteractionSerialEvaluator(
cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name
)
# load model (replace 'model path' with the path to your trained checkpoint)
model.load_state_dict(torch.load('model path', map_location='cpu')["model"])
evaluator.eval(None, -1, -1)


if __name__ == "__main__":
beergame_ppo_config.exp_name = 'beergame_evaluate'
main(beergame_ppo_config)
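Usage sketch for this entry script: after filling in the checkpoint path placeholder above with a checkpoint produced by training with the config earlier in this PR, it can presumably be run from the repository root, and the evaluation figures are written to the current directory because `figure_path` is './':

# assumed invocation from the repository root
# python dizoo/beergame/entry/beergame_eval.py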
dizoo/beergame/envs/BGAgent.py (152 additions, 0 deletions)
@@ -0,0 +1,152 @@
# Code Reference: https://github.com/OptMLGroup/DeepBeerInventory-RL.
import argparse
import numpy as np


# Here we want to define the agent class for the BeerGame
class Agent(object):
# initializes the agents with initial values for IL, OO and saves self.agentNum for recognizing the agents.
def __init__(
self, agentNum: int, IL: int, AO: int, AS: int, c_h: float, c_p: float, eta: int, compuType: str,
config: argparse.Namespace
) -> None:
self.agentNum = agentNum
self.IL = IL # Inventory level of each agent - changes during the game
self.OO = 0 # Open order of each agent - changes during the game
self.ASInitial = AS # the initial arriving shipment.
self.ILInitial = IL # IL at which we start each game with this number
self.AOInitial = AO # AO at which we start each game with this number
self.config = config # an instance of config is stored inside the class
self.curState = [] # the current state of the game
self.nextState = []
self.curReward = 0 # the reward observed at the current step
self.cumReward = 0 # cumulative reward; reset at the beginning of each episode
self.totRew = 0 # total reward of all players, tracked for the current player
self.c_h = c_h # holding cost
self.c_p = c_p # backorder cost
self.eta = eta # the total cost regularizer
self.AS = np.zeros((1, 1)) # arrived shipment
self.AO = np.zeros((1, 1)) # arrived order
self.action = 0 # the action at time t
self.compType = compuType
# self.compTypeTrain = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists
# self.compTypeTest = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists
self.alpha_b = self.config.alpha_b[self.agentNum] # parameters for the formula
self.betta_b = self.config.betta_b[self.agentNum] # parameters for the formula
if self.config.demandDistribution == 0:
self.a_b = np.mean((self.config.demandUp, self.config.demandLow)) # parameters for the formula
self.b_b = np.mean((self.config.demandUp, self.config.demandLow)) * (
np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
) # parameters for the formula
elif self.config.demandDistribution == 1 or self.config.demandDistribution == 3 or self.config.demandDistribution == 4:
self.a_b = self.config.demandMu # parameters for the formula
self.b_b = self.config.demandMu * (
np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
) # parameters for the formula
elif self.config.demandDistribution == 2:
self.a_b = 8 # parameters for the formula
self.b_b = (3 / 4.) * 8 * (
np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
) # parameters for the formula
elif self.config.demandDistribution == 3:
self.a_b = 10 # parameters for the formula
self.b_b = 7 * (
np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
) # parameters for the formula
else:
raise Exception('The demand distribution is not defined or it is not a valid type.')
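# Worked example for demandDistribution == 0 (hypothetical config values, for illustration only):
# with demandLow = 0, demandUp = 2, leadRecItemLow = leadRecItemUp = 2 and leadRecOrderLow = leadRecOrderUp = 2,
# a_b = mean(2, 0) = 1.0 and b_b = 1.0 * (mean(2, 2) + mean(2, 2)) = 4.0,
# i.e. b_b approximates the mean demand times the total (shipment + order) lead time.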

self.hist = [] # this is used for plotting - keeps the history for only one game
self.hist2 = [] # this is used for animation usage
self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has come up with. added on Nov 8, 2017
self.T = 0
self.bsBaseStock = 0
self.init_bsBaseStock = 0
self.nextObservation = []

if self.compType == 'srdqn':
# sets the initial input of the network
self.currentState = np.stack(
[self.curState for _ in range(self.config.multPerdInpt)], axis=0
) # multPerdInpt observations stacked. each row is an observation

# reset player information
def resetPlayer(self, T: int):
self.IL = self.ILInitial
self.OO = 0
self.AS = np.squeeze(
np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
) # arrived shipment
self.AO = np.squeeze(
np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
) # arrived order
if self.agentNum != 0:
for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]):
self.AO[i] = self.AOInitial[self.agentNum - 1]
for i in range(self.config.leadRecItemUp[self.agentNum]):
self.AS[i] = self.ASInitial
self.curReward = 0 # the reward observed at the current step
self.cumReward = 0 # cumulative reward; reset at the beginning of each episode
self.action = []
self.hist = []
self.hist2 = []
self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has come up with. added on Nov 8, 2017
self.T = T
self.curObservation = self.getCurState(1) # this function gets the current state of the game
self.nextObservation = []
if self.compType == 'srdqn':
self.currentState = np.stack([self.curObservation for _ in range(self.config.multPerdInpt)], axis=0)

# updates the IL and OO at time t, after receiving "rec" number of items
def recieveItems(self, time: int) -> None:
self.IL = self.IL + self.AS[time] # inventory level update
self.OO = self.OO - self.AS[time] # inventory in transit update

# find action Value associated with the action list
def actionValue(self, curTime: int) -> int:
if self.config.fixedAction:
a = self.config.actionList[np.argmax(self.action)]
else:
# "d + x" rule
if self.compType == 'srdqn':
a = max(0, self.config.actionList[np.argmax(self.action)] * self.config.action_step + self.AO[curTime])
elif self.compType == 'rnd':
a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime])
else:
a = max(0, self.config.actionListOpt[np.argmax(self.action)])

return a

# getReward returns the reward at the current state
def getReward(self) -> None:
# cost (holding + backorder) for one time unit
self.curReward = (self.c_p * max(0, -self.IL) + self.c_h * max(0, self.IL)) / 200. # self.config.Ttest #
self.curReward = -self.curReward
# make reward negative, because it is the cost

# accumulate the discounted cumulative reward of this agent
self.cumReward = self.config.gamma * self.cumReward + self.curReward

# This function returns a np.array of the current state of the agent
def getCurState(self, t: int) -> np.ndarray:
if self.config.ifUseASAO:
if self.config.if_use_AS_t_plus_1:
curState = np.array(
[-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t], self.AO[t]]
)
else:
curState = np.array(
[-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t - 1], self.AO[t]]
)
else:
curState = np.array([-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO])

if self.config.ifUseActionInD:
a = self.config.actionList[np.argmax(self.action)]
curState = np.concatenate((curState, np.array([a])))

return curState
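For clarity, a standalone restatement of the observation layout that `getCurState` builds in the `ifUseASAO` branch above; this sketch is not part of the PR and only mirrors the expression in the code.

import numpy as np


def beergame_observation(IL: float, OO: float, AS_t: float, AO_t: float) -> np.ndarray:
    # [backorder, on-hand inventory, open orders, arrived shipment, arrived order]
    backorder = -IL if IL < 0 else 0.
    on_hand = IL if IL > 0 else 0.
    return np.array([backorder, on_hand, OO, AS_t, AO_t])


# e.g. IL = -3 (3 units backordered), OO = 4, AS[t] = 2, AO[t] = 1 -> [3., 0., 4., 2., 1.]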
dizoo/beergame/envs/__init__.py (2 additions, 0 deletions)
@@ -0,0 +1,2 @@
from .clBeergame import clBeerGame
from .beergame_core import BeerGame