From 68e66759c26e5f3974ae7015253c995431629431 Mon Sep 17 00:00:00 2001 From: laurcate Date: Fri, 31 Mar 2023 10:28:51 +0200 Subject: [PATCH 1/6] beam-search pr --- notebooks/planner_byo_example.ipynb | 156 ++++++++++++++++++++++++++-- requirements.txt | 2 + src/a2rl/simulator.py | 95 +++++++++++++++++ test/test_simulator.py | 46 +++++++- 4 files changed, 289 insertions(+), 10 deletions(-) diff --git a/notebooks/planner_byo_example.ipynb b/notebooks/planner_byo_example.ipynb index a2210cc..c1b6703 100644 --- a/notebooks/planner_byo_example.ipynb +++ b/notebooks/planner_byo_example.ipynb @@ -214,12 +214,14 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Define two planners\n", + "## Define three planners\n", "\n", - "Here we define two planner classes as examples to illustrate how to Bring Your Own planner to work with the `A2RL` simulator. We will add more planners (e.g. `BeamSearchPlanner`, etc.) as needed as per your feedback." + "Here we define three planner classes as examples to illustrate how to Bring Your Own planner to work with the `A2RL` simulator.\n", + "" ] }, { @@ -435,7 +437,112 @@ "\n", " q_accum_cost_list = q_accum_cost_list.transpose()\n", " q_non_accum_cost_list = q_non_accum_cost_list.transpose()\n", - " return [q_non_accum_cost_list, q_accum_cost_list]\n" + " return [q_non_accum_cost_list, q_accum_cost_list]\n", + "\n", + "\n", + "class BeamSearchQPlanner(A2RLPLanner):\n", + " \"\"\"\n", + " This planner has similar logic to the QPlanner, only it uses `a2rl.Simulator.beam_search_n_steps`\n", + " to obtain all the actions and rewards in one go.\n", + " The actions are still chosen with the highest / lowest sum_reward (immediate_reward + reward-to-go), \n", + " and take that action to the next step.\n", + " \"\"\"\n", + "\n", + " def __init__(self, simulator: Simulator, beam_width: int, beam_random: bool, objective: str = 'min') -> None:\n", + " super().__init__(simulator)\n", + "\n", + " self.beam_width = 
beam_width\n", + " self.beam_random = beam_random\n", + "\n", + " if objective.lower() not in ['min', 'max']:\n", + " raise ValueError('objective must be either min or max')\n", + " if 'min' == objective:\n", + " self.obj_op = np.argmin\n", + " else:\n", + " self.obj_op = np.argmax\n", + "\n", + " def rollout(self, horizon: int = 20, nb_runs: int = 3) -> List[np.array]:\n", + " if nb_runs != 1:\n", + " print(\"WARN: multiple runs in beam search is implemented as a loop and not vectorized and performance may be slow\")\n", + "\n", + " if nb_runs != 1 and not self.beam_random:\n", + " raise ValueError(\"'beam_random' should be True when using multiple runs\")\n", + "\n", + " dataframe_per_run = []\n", + " non_accum_rewards_list = []\n", + " accum_rewards_list = []\n", + "\n", + " initial_context = self.tokenizer.df_tokenized.iloc[0, : self.tokenizer.state_dim].values\n", + "\n", + " for i_run in range(nb_runs):\n", + " non_accum_rewards = []\n", + "\n", + " if initial_context.ndim != 1:\n", + " raise NotImplementedError(\"batching not implemented\")\n", + "\n", + " # Overwite some tokens here if you need\n", + " overwrite_valid_tokens = {}\n", + "\n", + " # Generate A+R+S tokens each time\n", + " context = initial_context\n", + " n_steps = self.tokenizer.action_dim + self.tokenizer.reward_dim + self.tokenizer.state_dim\n", + "\n", + " for i in tqdm(range(horizon)):\n", + " new_context, accum_logprobs = self.simulator.beam_search_n_steps(\n", + " seq=context,\n", + " n_steps=n_steps,\n", + " beam_width=self.beam_width,\n", + " randomness=self.beam_random,\n", + " overwrite_valid_tokens=overwrite_valid_tokens,\n", + " return_logprobs=True,\n", + " )\n", + "\n", + " ars_tokens = new_context[:, len(context) :]\n", + " df_ars = wi.WiDataFrame(\n", + " ars_tokens,\n", + " **self.tokenizer.df_tokenized.sar_d,\n", + " columns=[\n", + " *self.tokenizer.action_columns,\n", + " *self.tokenizer.reward_columns,\n", + " *self.tokenizer.state_columns,\n", + " ],\n", + " )\n", + 
"\n", + " df_sar = df_ars[df_ars.sar]\n", + " df_sar = self.tokenizer.field_tokenizer.inverse_transform(df_sar)\n", + "\n", + " rewards = df_sar[self.tokenizer.reward_columns].values\n", + " best_idx = self.obj_op(rewards.sum(axis=1))\n", + " non_accum_rewards.append(rewards[best_idx, 0])\n", + "\n", + " context = new_context[best_idx]\n", + "\n", + " # Uncomment the following if you want to record a dataframe per run\n", + " # widf_searched = wi.WiDataFrame(\n", + " # context[len(initial_context) :].reshape(horizon, -1),\n", + " # **self.tokenizer.df_tokenized.sar_d,\n", + " # columns=[\n", + " # *self.tokenizer.df_tokenized.actions,\n", + " # *self.tokenizer.df_tokenized.rewards,\n", + " # *self.tokenizer.df_tokenized.states,\n", + " # ],\n", + " # )\n", + " # widf_searched = widf_searched[widf_searched.sar]\n", + " # widf_searched = self.tokenizer.field_tokenizer.inverse_transform(widf_searched)\n", + " # widf_searched[\"nb_run\"] = i_run\n", + " # widf_searched[\"timestep\"] = range(1, len(widf_searched) + 1)\n", + " # dataframe_per_run.append(widf_searched)\n", + "\n", + " non_accum_rewards = np.array(non_accum_rewards)\n", + " accum_rewards = np.cumsum(non_accum_rewards, axis=0)\n", + "\n", + " non_accum_rewards_list.append(non_accum_rewards)\n", + " accum_rewards_list.append(accum_rewards)\n", + "\n", + " non_accum_rewards_list = np.array(non_accum_rewards_list)\n", + " accum_rewards_list = np.array(accum_rewards_list)\n", + "\n", + " return [non_accum_rewards_list, accum_rewards_list]\n" ] }, { @@ -473,12 +580,33 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Compare the costs (`system_power_consumption`) between two planners\n", + "### Create and run the `BeamSearchQPlanner` " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bsqp = BeamSearchQPlanner(simulator, beam_width=8, beam_random=True)\n", + "bsq_non_accum_cost_list, bsq_accum_cost_list = 
bsqp.rollout(horizon, nb_runs)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare the costs (`system_power_consumption`) between three planners\n", + "\n", + "On average (in the sense of **expected** outcome), the `Q-value Maximisation` planner (`QPlanner` for short) produces relatively lower `system_power_consumption`. However, the `Behaviour Clone` actions may occasionally perform equally well. This is due to the non-deterministic nature of both the *Simulator* when performing `simulator.lookahead()` and the randomness associated with `simulator.sample()`. Moreover, the GPT model associated with the *Simulator* in this example was not trained sufficiently in terms of both the number of epochs and the size of the training data.\n", "\n", - "On average (in the sense of **expected** outcome), the `Q-value Maximisation` planner (`QPlanner` for short) produces relatively lower `system_power_consumption`. However, the `Bahaviour Clone` actions may occasionally perform equally well. This is due to the non-deterministic nature of both the *Simulator* when performing `simulator.lookahead()` and the randomness associated with `simulator.sample()`. Moreover, the GPT model associated with the *Simulator* in this example was not trained sufficiently in terms of both the number of epochs and the size of the training data." + "The beam search planner should demonstrate a performance between behaviour cloning and Q-planner, since the idea of beam search is to create a better simulation and ask the planner not to be over-confident about the results." 
] }, { @@ -513,7 +641,12 @@ " step_list.append(j)\n", " acc_cost.append(q_accum_cost_list[i][j])\n", " inst_cost.append(q_non_accum_cost_list[i][j])\n", - " policy_list.append(\"q-value\")" + " policy_list.append(\"q-value\")\n", + "\n", + " step_list.append(j)\n", + " acc_cost.append(bsq_accum_cost_list[i][j])\n", + " inst_cost.append(bsq_non_accum_cost_list[i][j])\n", + " policy_list.append(\"beam-search\")" ] }, { @@ -550,6 +683,9 @@ "sns.lineplot(\n", " data=df_result[df_result.policy == \"q-value\"], x=\"step\", y=\"step_cost\", label=\"Q-value optimal\"\n", ")\n", + "sns.lineplot(\n", + " data=df_result[df_result.policy == \"beam-search\"], x=\"step\", y=\"step_cost\", label=\"Beam search\"\n", + ")\n", "plt.legend(fontsize=14)\n", "plt.grid(ls=\"--\")\n", "plt.xlabel(\"Step\", fontsize=16)\n", @@ -568,9 +704,11 @@ "source": [ "data1 = df_result[(df_result.policy == \"behaviour\")]\n", "data2 = df_result[(df_result.policy == \"q-value\")]\n", + "data3 = df_result[(df_result.policy == \"beam-search\")]\n", "\n", "sns.lineplot(data=data1, x=\"step\", y=\"acc_cost\", label=\"Behaviour clone\")\n", "sns.lineplot(data=data2, x=\"step\", y=\"acc_cost\", label=\"Q-value optimal\")\n", + "sns.lineplot(data=data3, x=\"step\", y=\"acc_cost\", label=\"Beam search\")\n", "plt.legend(fontsize=14)\n", "plt.grid(ls=\"--\")\n", "plt.xlabel(\"Step\", fontsize=16)\n", @@ -581,7 +719,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.12 ('a2rl')", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -595,12 +733,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.0" }, "toc-autonumbering": true, "vscode": { "interpreter": { - "hash": "62263fd135fd753cfd7c1bf88d5e743cb8b5f0e0f18aad3aa6722c0590b39cdb" + "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" } } }, diff --git a/requirements.txt b/requirements.txt index 950bc60..20c5bb2 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,5 @@ gym>=0.23.1,<0.26.0 seaborn cloudpickle pytorch-lightning>=1.5.0 + +tensorboardX \ No newline at end of file diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py index cf71168..95e2689 100644 --- a/src/a2rl/simulator.py +++ b/src/a2rl/simulator.py @@ -1248,6 +1248,101 @@ def _handle_unseen_token(self, seq: np.ndarray) -> np.ndarray: ) return np.array([valid_token[i] for i in neighbors_idx.ravel()]) + @torch.no_grad() + def beam_search_n_steps( + self, + seq: np.ndarray, + n_steps: int, + beam_width: int, + randomness: bool = False, + overwrite_valid_tokens: dict = None, # {"col_name": [valid tokens], ...} + start_col_idx: int = None, + is_gpt_token: bool = False, + return_logprobs: bool = False, + ): + """This function largely replaces A2RL Simulator.gpt_sample_n_steps(). It does not + concern states/actions/rewards and only generates the next N tokens using beam search. + This function is to be used by a planner. + + Args: + seq: A sequence of tokens (1-dimensional only) + n_steps: number of tokens to generate + beam_width: number of beams used in beam search. Must be <= n of valid tokens in + the starting column. Setting this to 1 is equivalent to behaviour cloning. + randomness: if True, will use multinomial sampling of the top-n tokens instead of + deterministic beam search. + overwrite_valid_tokens: dict[ col_name : list of GPT tokens ], overwrite the valid + tokens in a column, useful if additional constriants need to be applied during + inference. + start_col_index: Indicate the starting dataframe column index. Default to + `len(seq) % len(columns)` if None + is_gpt_token: whether the tokens in `seq` are GPT tokens or DataFrame tokens + return_logprobs: if True, the return will be a tuple of tokens and the accumulated + logprobs of each beam. 
+ """ + if seq.ndim != 1: + raise NotImplementedError("batching not implemented") + if overwrite_valid_tokens is None: + overwrite_valid_tokens = dict() + + if not is_gpt_token: + # seq and overwrite_valid_tokens are provided in Dataframe tokens + # Need to convert them to GPT tokens first + seq = self.tokenizer.gpt_tokenize(seq.ravel()).reshape(seq.shape) + + columns = self.tokenizer.columns + if start_col_idx is None: # assume seq is in SARSAR... format + start_col_idx = len(seq) % len(columns) + + seq = torch.tensor(seq, device=self.device).reshape(1, -1) + accum_logprobs = None + + for step in range(n_steps): + col_idx = (start_col_idx + step) % len(columns) + col_name = columns[col_idx] + if col_name in overwrite_valid_tokens: + valid_tokens = overwrite_valid_tokens[col_name] + + if not is_gpt_token: + valid_tokens = self.tokenizer.gpt_tokenize(np.asarray(valid_tokens)) + else: + valid_tokens = get_valid_gpt_token_idx( + self.tokenizer._col_eligible_index, + col_idx, + self.tokenizer.simulator_ds, + ) + + valid_tokens = torch.tensor(valid_tokens, device=self.device) + + if valid_tokens.size(0) == 1: + seq = torch.hstack((seq, valid_tokens.tile(beam_width, 1))) + continue + + logits = self._gpt_predict(seq, self.tokenizer.block_size) # shape = (beam_width, vocab_size) + logits = logits[:, valid_tokens] + logprobs = F.log_softmax(logits, dim=1) + if accum_logprobs is not None: # accum_logprobs is None on 1st loop + logprobs += accum_logprobs.reshape(-1, 1) + + if randomness: + top_indices = torch.multinomial(logprobs.flatten().exp(), beam_width, replacement=False) + accum_logprobs = logprobs.flatten()[top_indices] + else: + accum_logprobs, top_indices = torch.topk(logprobs.flatten(), beam_width) + seq_indices = torch.div(top_indices, valid_tokens.size(0), rounding_mode='floor') + token_indices = torch.remainder(top_indices, valid_tokens.size(0)) + + seq = torch.hstack((seq[seq_indices], valid_tokens[token_indices].reshape(-1, 1))) + + seq, accum_logprobs = 
seq.cpu().numpy(), accum_logprobs.cpu().numpy() + if not is_gpt_token: + seq = self.tokenizer.gpt_inverse_tokenize(seq.ravel()).reshape(seq.shape) + + if return_logprobs: + return seq, accum_logprobs + + return seq + def sample( self, seq: np.ndarray, diff --git a/test/test_simulator.py b/test/test_simulator.py index 26f1df9..47643a5 100644 --- a/test/test_simulator.py +++ b/test/test_simulator.py @@ -390,7 +390,7 @@ def test_sim_reset(sim): state = sim.reset() assert isinstance(state, np.ndarray) assert len(state) == len(sim.tokenizer.state_indices) - assert state.dtype == np.object + assert state.dtype == object assert sim._ix == 0 @@ -746,6 +746,50 @@ def test_sim_gpt_sample_n_steps(sim, start_col_index, gpt_token_context): assert cur_value in vald_token_range +@pytest.mark.parametrize( + "sim", + [ + pytest.lazy_fixture("sim_mingpt"), # type: ignore[operator] + pytest.lazy_fixture("sim_lightgpt"), # type: ignore[operator] + ], +) +@pytest.mark.parametrize( + "start_col_index, gpt_token_context", + [ + ( + 0, + np.array([0, 2, 16, 18, 5, 11]), + ), + ( + 2, + np.array([0, 2, 16, 18, 5, 11, 0, 2]), + ), + ], +) +def test_sim_beam_search_n_steps(sim: Simulator, start_col_index, gpt_token_context): + NUM_STEP = 4 + BEAM_WIDTH = 2 + # result = sim.gpt_sample_n_steps(gpt_token_context, NUM_STEP, start_col_index) + result = sim.beam_search_n_steps( + gpt_token_context, + NUM_STEP, + BEAM_WIDTH, + start_col_idx=start_col_index, + is_gpt_token=True, + ) + assert result.shape == (BEAM_WIDTH, NUM_STEP + len(gpt_token_context)) + for j in range(BEAM_WIDTH): + for i in range(NUM_STEP): + cur_idx = len(gpt_token_context) + i + cur_value = result[j, cur_idx : cur_idx + 1] + vald_token_range = get_valid_gpt_token_idx( + sim.tokenizer.col_eligible_index, + (start_col_index + i) % sim.tokenizer.column_len, + sim.tokenizer.simulator_ds, + ) + assert cur_value in vald_token_range + + @pytest.mark.parametrize( "sim", [ From b6371dd87b9c6f65626dc4e13f12587281d9b7c9 Mon Sep 17 
00:00:00 2001 From: Verdi March Date: Thu, 4 May 2023 10:44:48 +0800 Subject: [PATCH 2/6] Update requirements.txt Hopefully this fixes failing tests --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 20c5bb2..b21ae37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,4 @@ seaborn cloudpickle pytorch-lightning>=1.5.0 -tensorboardX \ No newline at end of file +tensorboardX From 4334747e6dd2d957b0403917ebf67778f35cc8f3 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Thu, 4 May 2023 11:03:05 +0800 Subject: [PATCH 3/6] Bump typeguard to minimum version 3.0.0 --- requirements.txt | 2 +- src/a2rl/_io.py | 12 ++++++------ test/test_tokenizer.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index b21ae37..7146c5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ torch tqdm>=4.64.1 PyYaml>=5.1 typing_extensions -typeguard +typeguard>=3.0.0 nptyping loguru diff --git a/src/a2rl/_io.py b/src/a2rl/_io.py index 3f870cb..7c401b0 100644 --- a/src/a2rl/_io.py +++ b/src/a2rl/_io.py @@ -187,12 +187,12 @@ class Metadata: tags: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: - check_type("states", self.states, List[str]) - check_type("actions", self.actions, List[str]) - check_type("rewards", self.rewards, List[str]) - check_type("forced_categories", self.forced_categories, Optional[List[str]]) - check_type("frequency", self.frequency, Optional[str]) - check_type("tags", self.tags, Dict[str, Any]) + check_type(self.states, List[str]) + check_type(self.actions, List[str]) + check_type(self.rewards, List[str]) + check_type(self.forced_categories, Optional[List[str]]) + check_type(self.frequency, Optional[str]) + check_type(self.tags, Dict[str, Any]) def read_metadata(yaml_file: str | Path) -> Metadata: diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 22bfa64..36bf8e4 
100644 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -201,7 +201,7 @@ def test_valid_tokens(): for i, (col_name, expected_tokens) in enumerate(expected.items()): for c in (i, col_name): actual = t.valid_tokens(c) - check_type(f"actual_valid_tokens", actual, List[Union[int, np.integer]]) + check_type(actual, List[Union[int, np.integer]]) assert expected_tokens == actual From bc91e90530e41558c7473b9aa80c8240493fc98b Mon Sep 17 00:00:00 2001 From: Verdi March Date: Thu, 4 May 2023 11:25:51 +0800 Subject: [PATCH 4/6] Support pandas>=1.5.0 (which deprecates df.iteritems() and sometime later completely drop it) --- notebooks/dataframe.ipynb | 5 ++++- test/test_dataset.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/notebooks/dataframe.ipynb b/notebooks/dataframe.ipynb index 8accf0b..0da03ae 100644 --- a/notebooks/dataframe.ipynb +++ b/notebooks/dataframe.ipynb @@ -239,7 +239,10 @@ "\n", "\n", "# Split the chiller df into 5 series (i.e., 1 for each column)\n", - "sers = [ser for _, ser in df.iteritems()]\n", + "if pd.__version__ >= '1.5.0':\n", + " sers = [ser for _, ser in df.items()]\n", + "else:\n", + " sers = [ser for _, ser in df.iteritems()]\n", "assert_same_sar(sers)\n", "\n", "# Scale the states and rewards\n", diff --git a/test/test_dataset.py b/test/test_dataset.py index a3043b0..ccce05b 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -14,6 +14,7 @@ from unittest import mock +import pandas as pd import pytest from pandas.api.types import is_numeric_dtype @@ -103,5 +104,8 @@ def test_forced_categories(df, tmp_path, forced_categories, expected_types): df.to_csv(p / "data.csv", index=False) df2 = wi.read_csv_dataset(p) - is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.iteritems()] + if pd.__version__ >= '1.5.0': + is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.items()] + else: + is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.iteritems()] assert is_numeric_series == 
expected_types From 20cdf3fb2a1c64ab070588e14808a8c266d925c6 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Thu, 4 May 2023 11:39:50 +0800 Subject: [PATCH 5/6] Improve docstrings --- src/a2rl/simulator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py index 95e2689..8394d40 100644 --- a/src/a2rl/simulator.py +++ b/src/a2rl/simulator.py @@ -1260,8 +1260,8 @@ def beam_search_n_steps( is_gpt_token: bool = False, return_logprobs: bool = False, ): - """This function largely replaces A2RL Simulator.gpt_sample_n_steps(). It does not - concern states/actions/rewards and only generates the next N tokens using beam search. + """This function largely replaces A2RL :meth:`Simulator.gpt_sample_n_steps()`. It does not + concern states/actions/rewards and only generates the next ``N`` tokens using beam search. This function is to be used by a planner. Args: @@ -1271,12 +1271,12 @@ def beam_search_n_steps( the starting column. Setting this to 1 is equivalent to behaviour cloning. randomness: if True, will use multinomial sampling of the top-n tokens instead of deterministic beam search. - overwrite_valid_tokens: dict[ col_name : list of GPT tokens ], overwrite the valid + overwrite_valid_tokens: ``dict[ col_name : list of GPT tokens ]``, overwrite the valid tokens in a column, useful if additional constriants need to be applied during inference. start_col_index: Indicate the starting dataframe column index. Default to - `len(seq) % len(columns)` if None - is_gpt_token: whether the tokens in `seq` are GPT tokens or DataFrame tokens + ``len(seq) % len(columns)`` if None + is_gpt_token: whether the tokens in ``seq`` are GPT tokens or DataFrame tokens return_logprobs: if True, the return will be a tuple of tokens and the accumulated logprobs of each beam. 
""" @@ -1340,7 +1340,7 @@ def beam_search_n_steps( if return_logprobs: return seq, accum_logprobs - + return seq def sample( From d5a4366c3164e1393b7a243a41430fb8cab24226 Mon Sep 17 00:00:00 2001 From: Songyi Yang Date: Fri, 5 May 2023 11:57:00 +0100 Subject: [PATCH 6/6] more tests & error for beam_width too large --- .../dynamic_pricing/flight_sales.py | 5 -- .../underfloor_heating_gym_env.py | 2 - src/a2rl/_io.py | 1 + src/a2rl/simulator.py | 9 +++- test/test_simulator.py | 54 +++++++++++++++---- 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/examples/sagemaker-training/dynamic_pricing/flight_sales.py b/examples/sagemaker-training/dynamic_pricing/flight_sales.py index 0821c4c..295c68d 100644 --- a/examples/sagemaker-training/dynamic_pricing/flight_sales.py +++ b/examples/sagemaker-training/dynamic_pricing/flight_sales.py @@ -95,7 +95,6 @@ def render(self): pass def step(self, action): - self.freight_price = self.config["freight_price"] + np.random.random() self.freight_price = np.round(self.freight_price, decimals=1) @@ -113,9 +112,7 @@ def fsigmoid(x, a, b, c): for i in range(self.visitors): if seats_left > 0: - if np.random.random() < fsigmoid([action], *self.params)[0]: - seats_left -= 1 tickets += 1 @@ -149,7 +146,6 @@ def fsigmoid(x, a, b, c): return state, reward, done, {} def context(self): - return wi.WiDataFrame( self.history.fillna(method="ffill"), states=["season", "freight_price"], @@ -158,7 +154,6 @@ def context(self): ) def reset(self): - self.config = config self.day = 1 self.max_time = self.config["max_time"] diff --git a/examples/underfloor_heating/underfloor_heating_gym_env.py b/examples/underfloor_heating/underfloor_heating_gym_env.py index 520155d..21d04a4 100644 --- a/examples/underfloor_heating/underfloor_heating_gym_env.py +++ b/examples/underfloor_heating/underfloor_heating_gym_env.py @@ -140,7 +140,6 @@ class UnderfloorEnv(gym.Env): """ def __init__(self, env_config: UnderfloorEnvConfig): - if not isinstance(env_config, 
UnderfloorEnvConfig): raise ValueError(f"Config must be of type UnderfloorEnvConfig, not {type(env_config)}") @@ -228,7 +227,6 @@ def reset(self, **kwargs) -> np.ndarray | tuple[np.ndarray, dict]: return self.state def step(self, action: list[int]) -> tuple[np.ndarray, float, bool, dict]: - state_action = np.concatenate((self.state, action), axis=None) # print(f"{self.state=}") # print(f"{state_action=}") diff --git a/src/a2rl/_io.py b/src/a2rl/_io.py index 7c401b0..f964fc6 100644 --- a/src/a2rl/_io.py +++ b/src/a2rl/_io.py @@ -337,6 +337,7 @@ def save_metadata( tags: {} """ + # Based on https://github.com/yaml/pyyaml/issues/127#issuecomment-525800484 class BlankLiner(yaml.SafeDumper): def write_line_break(self, data=None): diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py index 8394d40..6e0c365 100644 --- a/src/a2rl/simulator.py +++ b/src/a2rl/simulator.py @@ -1267,8 +1267,10 @@ def beam_search_n_steps( Args: seq: A sequence of tokens (1-dimensional only) n_steps: number of tokens to generate - beam_width: number of beams used in beam search. Must be <= n of valid tokens in - the starting column. Setting this to 1 is equivalent to behaviour cloning. + beam_width: number of beams used in beam search. Must be <= the vocab size in + the starting column (determined by both valid tokens of that column & + ``overwrite_valid_tokens``, if used). + Setting this to 1 is equivalent to behaviour cloning. randomness: if True, will use multinomial sampling of the top-n tokens instead of deterministic beam search. overwrite_valid_tokens: ``dict[ col_name : list of GPT tokens ]``, overwrite the valid @@ -1324,6 +1326,9 @@ def beam_search_n_steps( if accum_logprobs is not None: # accum_logprobs is None on 1st loop logprobs += accum_logprobs.reshape(-1, 1) + if beam_width > logprobs.numel(): + raise ValueError(f"beam_width cannot be larger than the vocab size of the starting column. 
Expect beam_width <= {logprobs.numel()}, got {beam_width}") + if randomness: top_indices = torch.multinomial(logprobs.flatten().exp(), beam_width, replacement=False) accum_logprobs = logprobs.flatten()[top_indices] diff --git a/test/test_simulator.py b/test/test_simulator.py index 47643a5..e7b81b1 100644 --- a/test/test_simulator.py +++ b/test/test_simulator.py @@ -764,30 +764,66 @@ def test_sim_gpt_sample_n_steps(sim, start_col_index, gpt_token_context): 2, np.array([0, 2, 16, 18, 5, 11, 0, 2]), ), + ( + None, + np.array([0, 2, 16, 18, 5, 11, 0, 2]), + ), ], ) def test_sim_beam_search_n_steps(sim: Simulator, start_col_index, gpt_token_context): NUM_STEP = 4 BEAM_WIDTH = 2 - # result = sim.gpt_sample_n_steps(gpt_token_context, NUM_STEP, start_col_index) + + # Test randomness=True result = sim.beam_search_n_steps( + gpt_token_context, + NUM_STEP, + BEAM_WIDTH, + randomness=True, + start_col_idx=start_col_index, + is_gpt_token=True, + ) + + # Test overwrite_valid_tokens and return_logprobs + overwrite_valid_tokens = ({"A1": [15]},) + result, accum_logprobs = sim.beam_search_n_steps( gpt_token_context, NUM_STEP, BEAM_WIDTH, start_col_idx=start_col_index, + overwrite_valid_tokens=overwrite_valid_tokens, is_gpt_token=True, + return_logprobs=True, ) + assert result.shape == (BEAM_WIDTH, NUM_STEP + len(gpt_token_context)) + assert accum_logprobs.shape == (BEAM_WIDTH,) + + if start_col_index is None: + start_col_index = len(gpt_token_context) % sim.tokenizer.column_len for j in range(BEAM_WIDTH): for i in range(NUM_STEP): - cur_idx = len(gpt_token_context) + i - cur_value = result[j, cur_idx : cur_idx + 1] - vald_token_range = get_valid_gpt_token_idx( - sim.tokenizer.col_eligible_index, - (start_col_index + i) % sim.tokenizer.column_len, - sim.tokenizer.simulator_ds, - ) - assert cur_value in vald_token_range + cur_value = result[j, len(gpt_token_context) + i] + col_idx = (start_col_index + i) % sim.tokenizer.column_len + col_name = sim.tokenizer.columns[col_idx] + + if 
col_name in overwrite_valid_tokens: + assert cur_value in overwrite_valid_tokens[col_name] + else: + valid_token_range = get_valid_gpt_token_idx( + sim.tokenizer.col_eligible_index, + col_idx, + sim.tokenizer.simulator_ds, + ) + assert cur_value in valid_token_range + + BEAM_WIDTH = 99 + with pytest.raises( + ValueError, match="beam_width cannot be larger than the vocab size of the starting column" + ): + sim.beam_search_n_steps( + gpt_token_context, 1, BEAM_WIDTH, start_col_idx=start_col_index, is_gpt_token=True + ) @pytest.mark.parametrize(