From d82efc8d7638cabe7d80f8847b715c10b507e421 Mon Sep 17 00:00:00 2001
From: Sebastian Musslick
Date: Tue, 9 Jul 2024 19:03:21 -0400
Subject: [PATCH 1/6] initial commit of q-learning model

---
 .../synthetic/psychology/q_learning.py        | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 src/autora/experiment_runner/synthetic/psychology/q_learning.py

diff --git a/src/autora/experiment_runner/synthetic/psychology/q_learning.py b/src/autora/experiment_runner/synthetic/psychology/q_learning.py
new file mode 100644
index 00000000..5f104fdf
--- /dev/null
+++ b/src/autora/experiment_runner/synthetic/psychology/q_learning.py
@@ -0,0 +1,232 @@
+from functools import partial
+from typing import Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from autora.experiment_runner.synthetic.utilities import SyntheticExperimentCollection
+from autora.variable import DV, IV, ValueType, VariableCollection
+
+def _check_in_0_1_range(x, name):
+    if not (0 <= x <= 1):
+        raise ValueError(
+            f'Value of {name} must be in [0, 1] range. Found value of {x}.')
+
+class AgentQ:
+    """An agent that runs simple Q-learning for an n-armed bandit task.
+
+    Attributes:
+        alpha: The agent's learning rate
+        beta: The agent's softmax inverse temperature
+        q: The agent's current estimate of the reward probability on each arm
+    """
+
+    def __init__(
+        self,
+        alpha: float = 0.2,
+        beta: float = 3.,
+        n_actions: int = 2,
+        forget_rate: float = 0.,
+        perseverance_bias: float = 0.,
+        correlated_reward: bool = False,
+    ):
+        """Initialize the agent.
+
+        Args:
+            alpha: scalar learning rate
+            beta: scalar softmax inverse temperature parameter.
+            n_actions: number of actions (default=2)
+            forget_rate: rate at which q values decay toward the initial values (default=0)
+            perseverance_bias: rate at which q values move toward previous action (default=0)
+            correlated_reward: whether the q value of the non-chosen action is updated in the opposite direction (default=False)
+        """
+        self._prev_choice = -1
+        self._alpha = alpha
+        self._beta = beta
+        self._n_actions = n_actions
+        self._forget_rate = forget_rate
+        self._perseverance_bias = perseverance_bias
+        self._correlated_reward = correlated_reward
+        self._q_init = 0.5
+        self.new_sess()
+
+        _check_in_0_1_range(alpha, 'alpha')
+        _check_in_0_1_range(forget_rate, 'forget_rate')
+
+    def new_sess(self):
+        """Reset the agent for the beginning of a new session."""
+        self._q = self._q_init * np.ones(self._n_actions)
+        self._prev_choice = -1
+
+    def get_choice_probs(self) -> np.ndarray:
+        """Compute the choice probabilities as softmax over q."""
+        decision_variable = np.exp(self.q * self._beta)
+        choice_probs = decision_variable / np.sum(decision_variable)
+        return choice_probs
+
+    def get_choice(self) -> int:
+        """Sample a choice, given the agent's current internal state."""
+        choice_probs = self.get_choice_probs()
+        choice = np.random.choice(self._n_actions, p=choice_probs)
+        return choice
+
+    def update(self,
+               choice: int,
+               reward: float):
+        """Update the agent after one step of the task.
+
+        Args:
+            choice: The choice made by the agent. 0 or 1
+            reward: The reward received by the agent. 0 or 1
+        """
+
+        # Forgetting - decay q-values of non-chosen actions toward the initial value
+        non_chosen_action = np.arange(self._n_actions) != choice
+        self._q[non_chosen_action] = (1 - self._forget_rate) * self._q[
+            non_chosen_action] + self._forget_rate * self._q_init
+
+        # Reward-based update - move the q-value of the chosen action toward the observed reward
+        q_reward_update = - self._alpha * self._q[choice] + self._alpha * reward
+
+        # Correlated update - move the q-value of the non-chosen action in the opposite direction
+        if self._correlated_reward:
+            # index_correlated_update = self._n_actions - choice - 1
+            # self._q[index_correlated_update] = (1 - self._alpha) * self._q[index_correlated_update] + self._alpha * (1 - reward)
+            # alternative implementation - not dependent on reward but on reward-based update
+            index_correlated_update = self._n_actions - 1 - choice
+            self._q[index_correlated_update] -= 0.5 * q_reward_update
+
+        # Memorize current choice for perseveration
+        self._prev_choice = choice
+
+        self._q[choice] += q_reward_update
+
+    @property
+    def q(self):
+        q = self._q.copy()
+        if self._prev_choice != -1:
+            q[self._prev_choice] += self._perseverance_bias
+        return q
+
+
+def q_learning(
+    name="Q-Learning",
+    learning_rate: float = 0.2,
+    decision_noise: float = 3.,
+    n_actions: int = 2,
+    forget_rate: float = 0.,
+    perseverance_bias: float = 0.,
+    correlated_reward: bool = False,
+):
+    """
+    An agent that runs simple Q-learning for an n-armed bandit task.
+
+    Args:
+        name: name of the experiment
+        trials: number of trials (fixed to 100 for this experiment)
+        learning_rate: learning rate for Q-learning
+        decision_noise: softmax inverse temperature (higher values give less decision noise)
+        n_actions: number of actions
+        forget_rate: rate of forgetting
+        perseverance_bias: bias towards choosing the previously chosen action
+        correlated_reward: whether rewards are correlated
+
+    Examples:
+        >>> experiment = q_learning()
+
+        # The runner can accept numpy arrays or pandas DataFrames, but the return value will
+        # always be a list of numpy arrays. Each array corresponds to the choices made by the agent
+        # for each trial in the input. Thus, arrays have shape (n_trials, n_actions).
+        >>> experiment.run(np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]))
+
+        # The runner can accept pandas DataFrames. Each cell of the DataFrame should contain a
+        # numpy array with shape (n_trials, n_actions). The return value will be a list of numpy
+        # arrays, each corresponding to the choices made by the agent for each trial in the input.
+        >>> experiment.run(pd.DataFrame({'reward array': [[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]}))
+    """
+
+    params = dict(
+        name=name,
+        trials=100,
+        learning_rate=learning_rate,
+        decision_noise=decision_noise,
+        n_actions=n_actions,
+        forget_rate=forget_rate,
+        perseverance_bias=perseverance_bias,
+        correlated_reward=correlated_reward,
+    )
+
+    iv1 = IV(
+        name="reward array",
+        units="reward",
+        variable_label="Reward Sequence",
+        type=ValueType.BOOLEAN,
+    )
+
+    dv1 = DV(
+        name="choice array",
+        units="actions",
+        variable_label="Action Sequence",
+        type=ValueType.REAL,
+    )
+
+    variables = VariableCollection(
+        independent_variables=[iv1],
+        dependent_variables=[dv1],
+    )
+
+    def run_AgentQ(rewards):
+        if (rewards.shape[1] != n_actions):
+            Warning("Number of actions in rewards does not match n_actions. Will use "
+                    + str(rewards.shape[1]) + " actions.")
Will use " + str(rewards.shape[1] + + " actions.")) + num_trials = rewards.shape[0] + + y = np.zeros(rewards.shape) + + agent = AgentQ( + alpha=learning_rate, + beta=decision_noise, + n_actions=rewards.shape[1], + forget_rate=forget_rate, + perseverance_bias=perseverance_bias, + correlated_reward=correlated_reward, + ) + + for i in range(num_trials): + choice = agent.get_choice() + y[i, choice] = 1 + reward = rewards[i, choice] + agent.update(choice, reward) + return y + + def run( + conditions: Union[pd.DataFrame, np.ndarray, np.recarray], + random_state: Optional[int] = None, + ): + + Y = list() + if isinstance(conditions, pd.DataFrame): + for index, session in conditions.iterrows(): + rewards = session[0] + Y.append(run_AgentQ(rewards)) + elif isinstance(conditions, np.ndarray): + Y.append(run_AgentQ(conditions)) + + return Y + + ground_truth = partial(run) + + def domain(): + return None + + collection = SyntheticExperimentCollection( + name=name, + description=q_learning.__doc__, + variables=variables, + run=run, + ground_truth=ground_truth, + domain=domain, + params=params, + factory_function=q_learning, + ) + return collection + From 2e3b90b8a6e78da66de0d8502820e44175ccfb2e Mon Sep 17 00:00:00 2001 From: Sebastian Musslick Date: Tue, 9 Jul 2024 19:38:58 -0400 Subject: [PATCH 2/6] fixed doc tests --- .../synthetic/psychology/q_learning.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/autora/experiment_runner/synthetic/psychology/q_learning.py b/src/autora/experiment_runner/synthetic/psychology/q_learning.py index 5f104fdf..2570f72f 100644 --- a/src/autora/experiment_runner/synthetic/psychology/q_learning.py +++ b/src/autora/experiment_runner/synthetic/psychology/q_learning.py @@ -136,12 +136,24 @@ def q_learning( # The runner can accept numpy arrays or pandas DataFrames, but the return value will # always be a list of numpy arrays. Each array corresponds to the choices made by the agent # for each trial in the input. Thus, arrays have shape (n_trials, n_actions). - >>> experiment.run(np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]])) + >>> experiment.run(np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]), random_state=42) + [array([[1., 0.], + [0., 1.], + [0., 1.], + [0., 1.], + [1., 0.], + [1., 0.]])] # The runner can accept pandas DataFrames. Each cell of the DataFrame should contain a # numpy array with shape (n_trials, n_actions). The return value will be a list of numpy # arrays, each corresponding to the choices made by the agent for each trial in the input. 
-        >>> experiment.run(pd.DataFrame({'reward array': [[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]}))
+        >>> experiment.run(pd.DataFrame({'reward array': [np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]])]}), random_state=42)
+        [array([[1., 0.],
+               [0., 1.],
+               [0., 1.],
+               [0., 1.],
+               [1., 0.],
+               [1., 0.]])]
     """

     params = dict(
@@ -203,6 +215,9 @@ def run(
         random_state: Optional[int] = None,
     ):

+        if random_state is not None:
+            np.random.seed(random_state)
+
         Y = list()
         if isinstance(conditions, pd.DataFrame):
             for index, session in conditions.iterrows():

From 6ae6dd3f415cffde8a654ee073262f8ce14d21ad Mon Sep 17 00:00:00 2001
From: Sebastian Musslick
Date: Tue, 9 Jul 2024 19:43:09 -0400
Subject: [PATCH 3/6] fixed doctest

---
 .../experiment_runner/synthetic/abstract/template_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
index 335d29a8..4ab1f513 100644
--- a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
+++ b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
@@ -29,7 +29,7 @@
         added_noise=0.0)

     >>> s.ground_truth(1.)
-    2.0
+    np.float64(2.0)

     >>> s.ground_truth(s.domain())
     array([[1.],

From 8f1d64e40a06a696c0349799e4e369d5c3d45f5f Mon Sep 17 00:00:00 2001
From: Sebastian Musslick
Date: Tue, 9 Jul 2024 19:46:57 -0400
Subject: [PATCH 4/6] fixed doctest

---
 .../experiment_runner/synthetic/abstract/template_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
index 4ab1f513..92fb2319 100644
--- a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
+++ b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
@@ -42,7 +42,7 @@
     >>> s.run # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     .run at 0x...>

     >>> s.run(1., random_state=42)
-    2.003047170797544
+    np.float64(2.003047170797544)

     >>> s.run(s.domain(), random_state=42)
     array([[1.00304717],

From 8b10830e08c4c70fa981b445acb5f364e9eaede3 Mon Sep 17 00:00:00 2001
From: Sebastian Musslick
Date: Tue, 9 Jul 2024 20:07:32 -0400
Subject: [PATCH 5/6] fixed doctest

---
 .../synthetic/abstract/template_experiment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
index 92fb2319..17b211bb 100644
--- a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
+++ b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
@@ -28,7 +28,7 @@
     functools.partial(.run at 0x...>,
         added_noise=0.0)

-    >>> s.ground_truth(1.)
+    >>> np.float64(s.ground_truth(1.))
     np.float64(2.0)

     >>> s.ground_truth(s.domain())
     array([[1.],
@@ -41,7 +41,7 @@
     >>> s.run # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     .run at 0x...>

-    >>> s.run(1., random_state=42)
+    >>> np.float64(s.run(1., random_state=42))
     np.float64(2.003047170797544)

     >>> s.run(s.domain(), random_state=42)
     array([[1.00304717],

From 1405f3b48ab47cd73a57e3bb37e59a9697f8594b Mon Sep 17 00:00:00 2001
From: Sebastian Musslick
Date: Tue, 9 Jul 2024 20:09:29 -0400
Subject: [PATCH 6/6] fixed doctest

---
 .../synthetic/abstract/template_experiment.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
index 17b211bb..af040d8a 100644
--- a/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
+++ b/src/autora/experiment_runner/synthetic/abstract/template_experiment.py
@@ -28,8 +28,8 @@
     functools.partial(.run at 0x...>,
         added_noise=0.0)

-    >>> np.float64(s.ground_truth(1.))
-    np.float64(2.0)
+    >>> float(s.ground_truth(1.))
+    2.0

     >>> s.ground_truth(s.domain())
     array([[1.],
@@ -41,8 +41,8 @@
     >>> s.run # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     .run at 0x...>

-    >>> np.float64(s.run(1., random_state=42))
-    np.float64(2.003047170797544)
+    >>> float(s.run(1., random_state=42))
+    2.003047170797544

     >>> s.run(s.domain(), random_state=42)
     array([[1.00304717],
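
Usage sketch (illustrative, not part of the patch series above): assuming the patched package is installed so that the new module from PATCH 1/6 is importable, the Q-learning runner can be exercised as follows; the parameter values here are arbitrary examples.

    import numpy as np
    import pandas as pd

    from autora.experiment_runner.synthetic.psychology.q_learning import q_learning

    # Build the synthetic experiment; learning_rate and decision_noise map onto
    # the agent's alpha and beta (softmax inverse temperature).
    experiment = q_learning(learning_rate=0.3, decision_noise=2.0)

    # Reward schedule for a 2-armed bandit: arm 1 pays off on the first three
    # trials, arm 0 on the last three.
    rewards = np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]])

    # Run on a numpy array; the result is a list containing one one-hot
    # (n_trials, n_actions) array of choices.
    choices = experiment.run(rewards, random_state=42)

    # Equivalent call with a DataFrame holding one reward array per session.
    choices_per_session = experiment.run(
        pd.DataFrame({"reward array": [rewards]}), random_state=42
    )

Because run() seeds numpy's global random state when random_state is given (PATCH 2/6), repeated calls with the same seed reproduce the same choice sequences.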