From 68e66759c26e5f3974ae7015253c995431629431 Mon Sep 17 00:00:00 2001
From: laurcate <laurcate@amazon.com>
Date: Fri, 31 Mar 2023 10:28:51 +0200
Subject: [PATCH 1/6] beam-search pr

---
 notebooks/planner_byo_example.ipynb | 156 ++++++++++++++++++++++++++--
 requirements.txt                    |   2 +
 src/a2rl/simulator.py               |  95 +++++++++++++++++
 test/test_simulator.py              |  46 +++++++-
 4 files changed, 289 insertions(+), 10 deletions(-)

diff --git a/notebooks/planner_byo_example.ipynb b/notebooks/planner_byo_example.ipynb
index a2210cc..c1b6703 100644
--- a/notebooks/planner_byo_example.ipynb
+++ b/notebooks/planner_byo_example.ipynb
@@ -214,12 +214,14 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Define two planners\n",
+    "## Define three planners\n",
     "\n",
-    "Here we define two planner classes as examples to illustrate how to Bring Your Own planner to work with the `A2RL` simulator. We will add more planners (e.g. `BeamSearchPlanner`, etc.) as needed as per your feedback."
+    "Here we define three planner classes as examples to illustrate how to Bring Your Own planner to work with the `A2RL` simulator.\n",
+    "<!-- We will add more planners (e.g. `BeamSearchPlanner`, etc.) as needed as per your feedback. -->"
    ]
   },
   {
@@ -435,7 +437,112 @@
     "\n",
     "        q_accum_cost_list = q_accum_cost_list.transpose()\n",
     "        q_non_accum_cost_list = q_non_accum_cost_list.transpose()\n",
-    "        return [q_non_accum_cost_list, q_accum_cost_list]\n"
+    "        return [q_non_accum_cost_list, q_accum_cost_list]\n",
+    "\n",
+    "\n",
+    "class BeamSearchQPlanner(A2RLPLanner):\n",
+    "    \"\"\"\n",
+    "    This planner has similar logic to the QPlanner, only it uses `a2rl.Simulator.beam_search_n_steps`\n",
+    "    to obtain all the actions and rewards in one go.\n",
+    "    The action with the highest / lowest sum_reward (immediate_reward + reward-to-go) is still chosen\n",
+    "    at every step and carried forward to the next step.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, simulator: Simulator, beam_width: int, beam_random: bool, objective: str = 'min') -> None:\n",
+    "        super().__init__(simulator)\n",
+    "\n",
+    "        self.beam_width = beam_width\n",
+    "        self.beam_random = beam_random\n",
+    "\n",
+    "        if objective.lower() not in ['min', 'max']:  # validation is case-insensitive\n",
+    "            raise ValueError('objective must be either min or max')\n",
+    "        if 'min' == objective.lower():  # must agree with the check above, else 'MIN' picks argmax\n",
+    "            self.obj_op = np.argmin\n",
+    "        else:\n",
+    "            self.obj_op = np.argmax\n",
+    "\n",
+    "    def rollout(self, horizon: int = 20, nb_runs: int = 3) -> List[np.array]:\n",
+    "        if nb_runs != 1:\n",
+    "            print(\"WARN: multiple runs in beam search is implemented as a loop and not vectorized and performance may be slow\")\n",
+    "\n",
+    "        if nb_runs != 1 and not self.beam_random:\n",
+    "            raise ValueError(\"'beam_random' should be True when using multiple runs\")\n",
+    "\n",
+    "        dataframe_per_run = []\n",
+    "        non_accum_rewards_list = []\n",
+    "        accum_rewards_list = []\n",
+    "\n",
+    "        initial_context = self.tokenizer.df_tokenized.iloc[0, : self.tokenizer.state_dim].values\n",
+    "\n",
+    "        for i_run in range(nb_runs):\n",
+    "            non_accum_rewards = []\n",
+    "\n",
+    "            if initial_context.ndim != 1:\n",
+    "                raise NotImplementedError(\"batching not implemented\")\n",
+    "\n",
+    "            # Overwrite some tokens here if needed\n",
+    "            overwrite_valid_tokens = {}\n",
+    "\n",
+    "            # Generate A+R+S tokens each time\n",
+    "            context = initial_context\n",
+    "            n_steps = self.tokenizer.action_dim + self.tokenizer.reward_dim + self.tokenizer.state_dim\n",
+    "\n",
+    "            for i in tqdm(range(horizon)):\n",
+    "                new_context, accum_logprobs = self.simulator.beam_search_n_steps(\n",
+    "                    seq=context,\n",
+    "                    n_steps=n_steps,\n",
+    "                    beam_width=self.beam_width,\n",
+    "                    randomness=self.beam_random,\n",
+    "                    overwrite_valid_tokens=overwrite_valid_tokens,\n",
+    "                    return_logprobs=True,\n",
+    "                )\n",
+    "\n",
+    "                ars_tokens = new_context[:, len(context) :]\n",
+    "                df_ars = wi.WiDataFrame(\n",
+    "                    ars_tokens,\n",
+    "                    **self.tokenizer.df_tokenized.sar_d,\n",
+    "                    columns=[\n",
+    "                        *self.tokenizer.action_columns,\n",
+    "                        *self.tokenizer.reward_columns,\n",
+    "                        *self.tokenizer.state_columns,\n",
+    "                    ],\n",
+    "                )\n",
+    "\n",
+    "                df_sar = df_ars[df_ars.sar]\n",
+    "                df_sar = self.tokenizer.field_tokenizer.inverse_transform(df_sar)\n",
+    "\n",
+    "                rewards = df_sar[self.tokenizer.reward_columns].values\n",
+    "                best_idx = self.obj_op(rewards.sum(axis=1))\n",
+    "                non_accum_rewards.append(rewards[best_idx, 0])\n",
+    "\n",
+    "                context = new_context[best_idx]\n",
+    "\n",
+    "            # Uncomment the following if you want to record a dataframe per run\n",
+    "            # widf_searched = wi.WiDataFrame(\n",
+    "            #     context[len(initial_context) :].reshape(horizon, -1),\n",
+    "            #     **self.tokenizer.df_tokenized.sar_d,\n",
+    "            #     columns=[\n",
+    "            #         *self.tokenizer.df_tokenized.actions,\n",
+    "            #         *self.tokenizer.df_tokenized.rewards,\n",
+    "            #         *self.tokenizer.df_tokenized.states,\n",
+    "            #     ],\n",
+    "            # )\n",
+    "            # widf_searched = widf_searched[widf_searched.sar]\n",
+    "            # widf_searched = self.tokenizer.field_tokenizer.inverse_transform(widf_searched)\n",
+    "            # widf_searched[\"nb_run\"] = i_run\n",
+    "            # widf_searched[\"timestep\"] = range(1, len(widf_searched) + 1)\n",
+    "            # dataframe_per_run.append(widf_searched)\n",
+    "\n",
+    "            non_accum_rewards = np.array(non_accum_rewards)\n",
+    "            accum_rewards = np.cumsum(non_accum_rewards, axis=0)\n",
+    "\n",
+    "            non_accum_rewards_list.append(non_accum_rewards)\n",
+    "            accum_rewards_list.append(accum_rewards)\n",
+    "\n",
+    "        non_accum_rewards_list = np.array(non_accum_rewards_list)\n",
+    "        accum_rewards_list = np.array(accum_rewards_list)\n",
+    "\n",
+    "        return [non_accum_rewards_list, accum_rewards_list]\n"
    ]
   },
   {
@@ -473,12 +580,33 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Compare the costs (`system_power_consumption`) between two planners\n",
+    "### Create and run the `BeamSearchQPlanner` "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bsqp = BeamSearchQPlanner(simulator, beam_width=8, beam_random=True)\n",
+    "bsq_non_accum_cost_list, bsq_accum_cost_list = bsqp.rollout(horizon, nb_runs)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare the costs (`system_power_consumption`) between three planners\n",
+    "\n",
+    "On average (in the sense of **expected** outcome), the `Q-value Maximisation` planner (`QPlanner` for short) produces relatively lower `system_power_consumption`. However, the `Behaviour Clone` actions may occasionally perform equally well. This is due to the non-deterministic nature of both the *Simulator* when performing `simulator.lookahead()` and the randomness associated with `simulator.sample()`. Moreover, the GPT model associated with the *Simulator* in this example was not trained sufficiently in terms of both the number of epochs and the size of the training data.\n",
     "\n",
-    "On average (in the sense of **expected** outcome), the `Q-value Maximisation` planner (`QPlanner` for short) produces relatively lower `system_power_consumption`. However, the `Bahaviour Clone` actions may occasionally perform equally well. This is due to the non-deterministic nature of both the *Simulator* when performing `simulator.lookahead()` and the randomness associated with `simulator.sample()`. Moreover, the GPT model associated with the *Simulator* in this example was not trained sufficiently in terms of both the number of epochs and the size of the training data."
+    "The beam search planner should demonstrate a performance between behaviour cloning and Q-planner, since the idea of beam search is to create a better simulation and ask the planner not to be over-confident about the results."
    ]
   },
   {
@@ -513,7 +641,12 @@
     "        step_list.append(j)\n",
     "        acc_cost.append(q_accum_cost_list[i][j])\n",
     "        inst_cost.append(q_non_accum_cost_list[i][j])\n",
-    "        policy_list.append(\"q-value\")"
+    "        policy_list.append(\"q-value\")\n",
+    "\n",
+    "        step_list.append(j)\n",
+    "        acc_cost.append(bsq_accum_cost_list[i][j])\n",
+    "        inst_cost.append(bsq_non_accum_cost_list[i][j])\n",
+    "        policy_list.append(\"beam-search\")"
    ]
   },
   {
@@ -550,6 +683,9 @@
     "sns.lineplot(\n",
     "    data=df_result[df_result.policy == \"q-value\"], x=\"step\", y=\"step_cost\", label=\"Q-value optimal\"\n",
     ")\n",
+    "sns.lineplot(\n",
+    "    data=df_result[df_result.policy == \"beam-search\"], x=\"step\", y=\"step_cost\", label=\"Beam search\"\n",
+    ")\n",
     "plt.legend(fontsize=14)\n",
     "plt.grid(ls=\"--\")\n",
     "plt.xlabel(\"Step\", fontsize=16)\n",
@@ -568,9 +704,11 @@
    "source": [
     "data1 = df_result[(df_result.policy == \"behaviour\")]\n",
     "data2 = df_result[(df_result.policy == \"q-value\")]\n",
+    "data3 = df_result[(df_result.policy == \"beam-search\")]\n",
     "\n",
     "sns.lineplot(data=data1, x=\"step\", y=\"acc_cost\", label=\"Behaviour clone\")\n",
     "sns.lineplot(data=data2, x=\"step\", y=\"acc_cost\", label=\"Q-value optimal\")\n",
+    "sns.lineplot(data=data3, x=\"step\", y=\"acc_cost\", label=\"Beam search\")\n",
     "plt.legend(fontsize=14)\n",
     "plt.grid(ls=\"--\")\n",
     "plt.xlabel(\"Step\", fontsize=16)\n",
@@ -581,7 +719,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.9.12 ('a2rl')",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -595,12 +733,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.11.0"
   },
   "toc-autonumbering": true,
   "vscode": {
    "interpreter": {
-    "hash": "62263fd135fd753cfd7c1bf88d5e743cb8b5f0e0f18aad3aa6722c0590b39cdb"
+    "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
    }
   }
  },
diff --git a/requirements.txt b/requirements.txt
index 950bc60..20c5bb2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,5 @@ gym>=0.23.1,<0.26.0
 seaborn
 cloudpickle
 pytorch-lightning>=1.5.0
+
+tensorboardX
\ No newline at end of file
diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py
index cf71168..95e2689 100644
--- a/src/a2rl/simulator.py
+++ b/src/a2rl/simulator.py
@@ -1248,6 +1248,101 @@ def _handle_unseen_token(self, seq: np.ndarray) -> np.ndarray:
         )
         return np.array([valid_token[i] for i in neighbors_idx.ravel()])
 
+    @torch.no_grad()
+    def beam_search_n_steps(
+        self,
+        seq: np.ndarray,
+        n_steps: int,
+        beam_width: int,
+        randomness: bool = False,
+        overwrite_valid_tokens: dict = None,  # {"col_name": [valid tokens], ...}
+        start_col_idx: int = None,
+        is_gpt_token: bool = False,
+        return_logprobs: bool = False,
+    ):
+        """This function largely replaces A2RL Simulator.gpt_sample_n_steps(). It does not
+        concern states/actions/rewards and only generates the next N tokens using beam search.
+        This function is to be used by a planner.
+
+        Args:
+            seq: A sequence of tokens (1-dimensional only)
+            n_steps: number of tokens to generate
+            beam_width: number of beams used in beam search. Must be <= n of valid tokens in
+                the starting column. Setting this to 1 is equivalent to behaviour cloning.
+            randomness: if True, will use multinomial sampling of the top-n tokens instead of
+                deterministic beam search.
+            overwrite_valid_tokens: dict[ col_name : list of GPT tokens ], overwrite the valid
+                tokens in a column, useful if additional constriants need to be applied during
+                inference.
+            start_col_index: Indicate the starting dataframe column index. Default to
+                `len(seq) % len(columns)` if None
+            is_gpt_token: whether the tokens in `seq` are GPT tokens or DataFrame tokens
+            return_logprobs: if True, the return will be a tuple of tokens and the accumulated
+                logprobs of each beam.
+        """
+        if seq.ndim != 1:
+            raise NotImplementedError("batching not implemented")
+        if overwrite_valid_tokens is None:
+            overwrite_valid_tokens = dict()
+
+        if not is_gpt_token:
+            # seq and overwrite_valid_tokens are provided in Dataframe tokens
+            # Need to convert them to GPT tokens first
+            seq = self.tokenizer.gpt_tokenize(seq.ravel()).reshape(seq.shape)
+
+        columns = self.tokenizer.columns
+        if start_col_idx is None:  # assume seq is in SARSAR... format
+            start_col_idx = len(seq) % len(columns)
+
+        seq = torch.tensor(seq, device=self.device).reshape(1, -1)
+        accum_logprobs = None  # stays None until a step is scored; forced tokens add log(1) = 0
+
+        for step in range(n_steps):
+            col_idx = (start_col_idx + step) % len(columns)
+            col_name = columns[col_idx]
+            if col_name in overwrite_valid_tokens:
+                valid_tokens = overwrite_valid_tokens[col_name]
+
+                if not is_gpt_token:
+                    valid_tokens = self.tokenizer.gpt_tokenize(np.asarray(valid_tokens))
+            else:
+                valid_tokens = get_valid_gpt_token_idx(
+                    self.tokenizer._col_eligible_index,
+                    col_idx,
+                    self.tokenizer.simulator_ds,
+                )
+
+            valid_tokens = torch.tensor(valid_tokens, device=self.device)
+
+            if valid_tokens.size(0) == 1:
+                seq = torch.hstack((seq, valid_tokens.tile(seq.size(0), 1)))  # one per current beam
+                continue
+
+            logits = self._gpt_predict(seq, self.tokenizer.block_size)  # shape = (beam_width, vocab_size)
+            logits = logits[:, valid_tokens]
+            logprobs = F.log_softmax(logits, dim=1)
+            if accum_logprobs is not None:  # accum_logprobs is None on 1st loop
+                logprobs += accum_logprobs.reshape(-1, 1)
+
+            if randomness:
+                top_indices = torch.multinomial(logprobs.flatten().exp(), beam_width, replacement=False)
+                accum_logprobs = logprobs.flatten()[top_indices]
+            else:
+                accum_logprobs, top_indices = torch.topk(logprobs.flatten(), beam_width)
+            seq_indices = torch.div(top_indices, valid_tokens.size(0), rounding_mode='floor')
+            token_indices = torch.remainder(top_indices, valid_tokens.size(0))
+
+            seq = torch.hstack((seq[seq_indices], valid_tokens[token_indices].reshape(-1, 1)))
+
+        seq = seq.cpu().numpy()
+        accum_logprobs = np.zeros(len(seq)) if accum_logprobs is None else accum_logprobs.cpu().numpy()
+        if not is_gpt_token:
+            seq = self.tokenizer.gpt_inverse_tokenize(seq.ravel()).reshape(seq.shape)
+
+        if return_logprobs:
+            return seq, accum_logprobs
+    
+        return seq
+
     def sample(
         self,
         seq: np.ndarray,
diff --git a/test/test_simulator.py b/test/test_simulator.py
index 26f1df9..47643a5 100644
--- a/test/test_simulator.py
+++ b/test/test_simulator.py
@@ -390,7 +390,7 @@ def test_sim_reset(sim):
     state = sim.reset()
     assert isinstance(state, np.ndarray)
     assert len(state) == len(sim.tokenizer.state_indices)
-    assert state.dtype == np.object
+    assert state.dtype == object
     assert sim._ix == 0
 
 
@@ -746,6 +746,50 @@ def test_sim_gpt_sample_n_steps(sim, start_col_index, gpt_token_context):
         assert cur_value in vald_token_range
 
 
+@pytest.mark.parametrize(
+    "sim",
+    [
+        pytest.lazy_fixture("sim_mingpt"),  # type: ignore[operator]
+        pytest.lazy_fixture("sim_lightgpt"),  # type: ignore[operator]
+    ],
+)
+@pytest.mark.parametrize(
+    "start_col_index, gpt_token_context",
+    [
+        (
+            0,
+            np.array([0, 2, 16, 18, 5, 11]),
+        ),
+        (
+            2,
+            np.array([0, 2, 16, 18, 5, 11, 0, 2]),
+        ),
+    ],
+)
+def test_sim_beam_search_n_steps(sim: Simulator, start_col_index, gpt_token_context):
+    NUM_STEP = 4
+    BEAM_WIDTH = 2
+    # result = sim.gpt_sample_n_steps(gpt_token_context, NUM_STEP, start_col_index)
+    result = sim.beam_search_n_steps(
+        gpt_token_context,
+        NUM_STEP,
+        BEAM_WIDTH,
+        start_col_idx=start_col_index,
+        is_gpt_token=True,
+    )
+    assert result.shape == (BEAM_WIDTH, NUM_STEP + len(gpt_token_context))
+    for j in range(BEAM_WIDTH):
+        for i in range(NUM_STEP):
+            cur_idx = len(gpt_token_context) + i
+            cur_value = result[j, cur_idx : cur_idx + 1]
+            vald_token_range = get_valid_gpt_token_idx(
+                sim.tokenizer.col_eligible_index,
+                (start_col_index + i) % sim.tokenizer.column_len,
+                sim.tokenizer.simulator_ds,
+            )
+            assert cur_value in vald_token_range
+
+
 @pytest.mark.parametrize(
     "sim",
     [

From b6371dd87b9c6f65626dc4e13f12587281d9b7c9 Mon Sep 17 00:00:00 2001
From: Verdi March <verdimrc@users.noreply.github.com>
Date: Thu, 4 May 2023 10:44:48 +0800
Subject: [PATCH 2/6] Update requirements.txt

Hopefully this fixes failing tests
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 20c5bb2..b21ae37 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,4 @@ seaborn
 cloudpickle
 pytorch-lightning>=1.5.0
 
-tensorboardX
\ No newline at end of file
+tensorboardX

From 4334747e6dd2d957b0403917ebf67778f35cc8f3 Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Thu, 4 May 2023 11:03:05 +0800
Subject: [PATCH 3/6] Bump typeguard to minimum version 3.0.0

---
 requirements.txt       |  2 +-
 src/a2rl/_io.py        | 12 ++++++------
 test/test_tokenizer.py |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b21ae37..7146c5b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ torch
 tqdm>=4.64.1
 PyYaml>=5.1
 typing_extensions
-typeguard
+typeguard>=3.0.0
 nptyping
 loguru
 
diff --git a/src/a2rl/_io.py b/src/a2rl/_io.py
index 3f870cb..7c401b0 100644
--- a/src/a2rl/_io.py
+++ b/src/a2rl/_io.py
@@ -187,12 +187,12 @@ class Metadata:
     tags: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
-        check_type("states", self.states, List[str])
-        check_type("actions", self.actions, List[str])
-        check_type("rewards", self.rewards, List[str])
-        check_type("forced_categories", self.forced_categories, Optional[List[str]])
-        check_type("frequency", self.frequency, Optional[str])
-        check_type("tags", self.tags, Dict[str, Any])
+        check_type(self.states, List[str])
+        check_type(self.actions, List[str])
+        check_type(self.rewards, List[str])
+        check_type(self.forced_categories, Optional[List[str]])
+        check_type(self.frequency, Optional[str])
+        check_type(self.tags, Dict[str, Any])
 
 
 def read_metadata(yaml_file: str | Path) -> Metadata:
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 22bfa64..36bf8e4 100644
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -201,7 +201,7 @@ def test_valid_tokens():
     for i, (col_name, expected_tokens) in enumerate(expected.items()):
         for c in (i, col_name):
             actual = t.valid_tokens(c)
-            check_type(f"actual_valid_tokens", actual, List[Union[int, np.integer]])
+            check_type(actual, List[Union[int, np.integer]])
             assert expected_tokens == actual
 
 

From bc91e90530e41558c7473b9aa80c8240493fc98b Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Thu, 4 May 2023 11:25:51 +0800
Subject: [PATCH 4/6] Support pandas>=1.5.0 (which deprecates df.iteritems()
 and sometime later completely drop it)

---
 notebooks/dataframe.ipynb | 5 ++++-
 test/test_dataset.py      | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/notebooks/dataframe.ipynb b/notebooks/dataframe.ipynb
index 8accf0b..0da03ae 100644
--- a/notebooks/dataframe.ipynb
+++ b/notebooks/dataframe.ipynb
@@ -239,7 +239,10 @@
     "\n",
     "\n",
     "# Split the chiller df into 5 series (i.e., 1 for each column)\n",
-    "sers = [ser for _, ser in df.iteritems()]\n",
+    "if pd.__version__ >= '1.5.0':\n",
+    "    sers = [ser for _, ser in df.items()]\n",
+    "else:\n",
+    "    sers = [ser for _, ser in df.iteritems()]\n",
     "assert_same_sar(sers)\n",
     "\n",
     "# Scale the states and rewards\n",
diff --git a/test/test_dataset.py b/test/test_dataset.py
index a3043b0..ccce05b 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -14,6 +14,7 @@
 
 from unittest import mock
 
+import pandas as pd
 import pytest
 from pandas.api.types import is_numeric_dtype
 
@@ -103,5 +104,8 @@ def test_forced_categories(df, tmp_path, forced_categories, expected_types):
     df.to_csv(p / "data.csv", index=False)
 
     df2 = wi.read_csv_dataset(p)
-    is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.iteritems()]
+    if pd.__version__ >= '1.5.0':
+        is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.items()]
+    else:
+        is_numeric_series = [is_numeric_dtype(ser) for _, ser in df2.iteritems()]
     assert is_numeric_series == expected_types

From 20cdf3fb2a1c64ab070588e14808a8c266d925c6 Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Thu, 4 May 2023 11:39:50 +0800
Subject: [PATCH 5/6] Improve docstrings

---
 src/a2rl/simulator.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py
index 95e2689..8394d40 100644
--- a/src/a2rl/simulator.py
+++ b/src/a2rl/simulator.py
@@ -1260,8 +1260,8 @@ def beam_search_n_steps(
         is_gpt_token: bool = False,
         return_logprobs: bool = False,
     ):
-        """This function largely replaces A2RL Simulator.gpt_sample_n_steps(). It does not
-        concern states/actions/rewards and only generates the next N tokens using beam search.
+        """This function largely replaces A2RL :meth:`Simulator.gpt_sample_n_steps()`. It does not
+        concern states/actions/rewards and only generates the next ``N`` tokens using beam search.
         This function is to be used by a planner.
 
         Args:
@@ -1271,12 +1271,12 @@ def beam_search_n_steps(
                 the starting column. Setting this to 1 is equivalent to behaviour cloning.
             randomness: if True, will use multinomial sampling of the top-n tokens instead of
                 deterministic beam search.
-            overwrite_valid_tokens: dict[ col_name : list of GPT tokens ], overwrite the valid
+            overwrite_valid_tokens: ``dict[ col_name : list of GPT tokens ]``, overwrite the valid
                 tokens in a column, useful if additional constriants need to be applied during
                 inference.
             start_col_index: Indicate the starting dataframe column index. Default to
-                `len(seq) % len(columns)` if None
-            is_gpt_token: whether the tokens in `seq` are GPT tokens or DataFrame tokens
+                ``len(seq) % len(columns)`` if None
+            is_gpt_token: whether the tokens in ``seq`` are GPT tokens or DataFrame tokens
             return_logprobs: if True, the return will be a tuple of tokens and the accumulated
                 logprobs of each beam.
         """
@@ -1340,7 +1340,7 @@ def beam_search_n_steps(
 
         if return_logprobs:
             return seq, accum_logprobs
-    
+
         return seq
 
     def sample(

From d5a4366c3164e1393b7a243a41430fb8cab24226 Mon Sep 17 00:00:00 2001
From: Songyi Yang <songyiy@amazon.co.uk>
Date: Fri, 5 May 2023 11:57:00 +0100
Subject: [PATCH 6/6] more tests & error for beam_width too large

---
 .../dynamic_pricing/flight_sales.py           |  5 --
 .../underfloor_heating_gym_env.py             |  2 -
 src/a2rl/_io.py                               |  1 +
 src/a2rl/simulator.py                         |  9 +++-
 test/test_simulator.py                        | 54 +++++++++++++++----
 5 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/examples/sagemaker-training/dynamic_pricing/flight_sales.py b/examples/sagemaker-training/dynamic_pricing/flight_sales.py
index 0821c4c..295c68d 100644
--- a/examples/sagemaker-training/dynamic_pricing/flight_sales.py
+++ b/examples/sagemaker-training/dynamic_pricing/flight_sales.py
@@ -95,7 +95,6 @@ def render(self):
         pass
 
     def step(self, action):
-
         self.freight_price = self.config["freight_price"] + np.random.random()
         self.freight_price = np.round(self.freight_price, decimals=1)
 
@@ -113,9 +112,7 @@ def fsigmoid(x, a, b, c):
 
         for i in range(self.visitors):
             if seats_left > 0:
-
                 if np.random.random() < fsigmoid([action], *self.params)[0]:
-
                     seats_left -= 1
                     tickets += 1
 
@@ -149,7 +146,6 @@ def fsigmoid(x, a, b, c):
         return state, reward, done, {}
 
     def context(self):
-
         return wi.WiDataFrame(
             self.history.fillna(method="ffill"),
             states=["season", "freight_price"],
@@ -158,7 +154,6 @@ def context(self):
         )
 
     def reset(self):
-
         self.config = config
         self.day = 1
         self.max_time = self.config["max_time"]
diff --git a/examples/underfloor_heating/underfloor_heating_gym_env.py b/examples/underfloor_heating/underfloor_heating_gym_env.py
index 520155d..21d04a4 100644
--- a/examples/underfloor_heating/underfloor_heating_gym_env.py
+++ b/examples/underfloor_heating/underfloor_heating_gym_env.py
@@ -140,7 +140,6 @@ class UnderfloorEnv(gym.Env):
     """
 
     def __init__(self, env_config: UnderfloorEnvConfig):
-
         if not isinstance(env_config, UnderfloorEnvConfig):
             raise ValueError(f"Config must be of type UnderfloorEnvConfig, not {type(env_config)}")
 
@@ -228,7 +227,6 @@ def reset(self, **kwargs) -> np.ndarray | tuple[np.ndarray, dict]:
             return self.state
 
     def step(self, action: list[int]) -> tuple[np.ndarray, float, bool, dict]:
-
         state_action = np.concatenate((self.state, action), axis=None)
         # print(f"{self.state=}")
         # print(f"{state_action=}")
diff --git a/src/a2rl/_io.py b/src/a2rl/_io.py
index 7c401b0..f964fc6 100644
--- a/src/a2rl/_io.py
+++ b/src/a2rl/_io.py
@@ -337,6 +337,7 @@ def save_metadata(
             tags: {}
             <BLANKLINE>
     """
+
     # Based on https://github.com/yaml/pyyaml/issues/127#issuecomment-525800484
     class BlankLiner(yaml.SafeDumper):
         def write_line_break(self, data=None):
diff --git a/src/a2rl/simulator.py b/src/a2rl/simulator.py
index 8394d40..6e0c365 100644
--- a/src/a2rl/simulator.py
+++ b/src/a2rl/simulator.py
@@ -1267,8 +1267,10 @@ def beam_search_n_steps(
         Args:
             seq: A sequence of tokens (1-dimensional only)
             n_steps: number of tokens to generate
-            beam_width: number of beams used in beam search. Must be <= n of valid tokens in
-                the starting column. Setting this to 1 is equivalent to behaviour cloning.
+            beam_width: number of beams used in beam search. Must be <= the vocab size in
+                the starting column (determined by both valid tokens of that column &
+                ``overwrite_valid_tokens``, if used).
+                Setting this to 1 is equivalent to behaviour cloning.
             randomness: if True, will use multinomial sampling of the top-n tokens instead of
                 deterministic beam search.
             overwrite_valid_tokens: ``dict[ col_name : list of GPT tokens ]``, overwrite the valid
@@ -1324,6 +1326,9 @@ def beam_search_n_steps(
             if accum_logprobs is not None:  # accum_logprobs is None on 1st loop
                 logprobs += accum_logprobs.reshape(-1, 1)
 
+            if beam_width > logprobs.numel():
+                raise ValueError(f"beam_width cannot be larger than the vocab size of the starting column. Expect beam_width <= {logprobs.numel()}, got {beam_width}")
+
             if randomness:
                 top_indices = torch.multinomial(logprobs.flatten().exp(), beam_width, replacement=False)
                 accum_logprobs = logprobs.flatten()[top_indices]
diff --git a/test/test_simulator.py b/test/test_simulator.py
index 47643a5..e7b81b1 100644
--- a/test/test_simulator.py
+++ b/test/test_simulator.py
@@ -764,30 +764,66 @@ def test_sim_gpt_sample_n_steps(sim, start_col_index, gpt_token_context):
             2,
             np.array([0, 2, 16, 18, 5, 11, 0, 2]),
         ),
+        (
+            None,
+            np.array([0, 2, 16, 18, 5, 11, 0, 2]),
+        ),
     ],
 )
 def test_sim_beam_search_n_steps(sim: Simulator, start_col_index, gpt_token_context):
     NUM_STEP = 4
     BEAM_WIDTH = 2
-    # result = sim.gpt_sample_n_steps(gpt_token_context, NUM_STEP, start_col_index)
+
+    # Test randomness=True
     result = sim.beam_search_n_steps(
+        gpt_token_context,
+        NUM_STEP,
+        BEAM_WIDTH,
+        randomness=True,
+        start_col_idx=start_col_index,
+        is_gpt_token=True,
+    )
+
+    # Test overwrite_valid_tokens and return_logprobs
+    overwrite_valid_tokens = ({"A1": [15]},)
+    result, accum_logprobs = sim.beam_search_n_steps(
         gpt_token_context,
         NUM_STEP,
         BEAM_WIDTH,
         start_col_idx=start_col_index,
+        overwrite_valid_tokens=overwrite_valid_tokens,
         is_gpt_token=True,
+        return_logprobs=True,
     )
+
     assert result.shape == (BEAM_WIDTH, NUM_STEP + len(gpt_token_context))
+    assert accum_logprobs.shape == (BEAM_WIDTH,)
+
+    if start_col_index is None:
+        start_col_index = len(gpt_token_context) % sim.tokenizer.column_len
     for j in range(BEAM_WIDTH):
         for i in range(NUM_STEP):
-            cur_idx = len(gpt_token_context) + i
-            cur_value = result[j, cur_idx : cur_idx + 1]
-            vald_token_range = get_valid_gpt_token_idx(
-                sim.tokenizer.col_eligible_index,
-                (start_col_index + i) % sim.tokenizer.column_len,
-                sim.tokenizer.simulator_ds,
-            )
-            assert cur_value in vald_token_range
+            cur_value = result[j, len(gpt_token_context) + i]
+            col_idx = (start_col_index + i) % sim.tokenizer.column_len
+            col_name = sim.tokenizer.columns[col_idx]
+
+            if col_name in overwrite_valid_tokens:
+                assert cur_value in overwrite_valid_tokens[col_name]
+            else:
+                valid_token_range = get_valid_gpt_token_idx(
+                    sim.tokenizer.col_eligible_index,
+                    col_idx,
+                    sim.tokenizer.simulator_ds,
+                )
+                assert cur_value in valid_token_range
+
+    BEAM_WIDTH = 99
+    with pytest.raises(
+        ValueError, match="beam_width cannot be larger than the vocab size of the starting column"
+    ):
+        sim.beam_search_n_steps(
+            gpt_token_context, 1, BEAM_WIDTH, start_col_idx=start_col_index, is_gpt_token=True
+        )
 
 
 @pytest.mark.parametrize(