diff --git a/src/seals/base_envs.py b/src/seals/base_envs.py
index 7edcf90..c7b29c6 100644
--- a/src/seals/base_envs.py
+++ b/src/seals/base_envs.py
@@ -212,7 +212,7 @@ def __init__(
         *,
         transition_matrix: np.ndarray,
         reward_matrix: np.ndarray,
-        horizon: float = np.inf,
+        horizon: Optional[int] = None,
         initial_state_dist: Optional[np.ndarray] = None,
     ):
         """Build tabular environment.
@@ -226,7 +226,8 @@ def __init__(
                 assumes neither the `action` nor `next_state` are used.
                 Of shape `(n_states,n_actions,n_states)[:n]` where `n` is
                 the dimensionality of the array.
-            horizon: Maximum number of timesteps, default `np.inf`.
+            horizon: Maximum number of timesteps. The default is `None`,
+                which represents an infinite horizon.
             initial_state_dist: Distribution from which state is sampled
                 at the start of the episode.  If `None`, it is assumed
                 initial state is always 0. Shape `(n_states,)`.
@@ -314,7 +315,7 @@ def reward(self, state: int, action: int, new_state: int) -> float:
 
     def terminal(self, state: int, n_actions_taken: int) -> bool:
         """Checks if state is terminal."""
-        return n_actions_taken >= self.horizon
+        return self.horizon is not None and n_actions_taken >= self.horizon
 
     @property
     def feature_matrix(self):
@@ -356,7 +357,7 @@ def __init__(
         transition_matrix: np.ndarray,
         observation_matrix: np.ndarray,
         reward_matrix: np.ndarray,
-        horizon: float = np.inf,
+        horizon: Optional[int] = None,
         initial_state_dist: Optional[np.ndarray] = None,
     ):
         """Initializes a tabular model POMDP."""
@@ -423,7 +424,7 @@ def __init__(
         *,
         transition_matrix: np.ndarray,
         reward_matrix: np.ndarray,
-        horizon: float = np.inf,
+        horizon: Optional[int] = None,
         initial_state_dist: Optional[np.ndarray] = None,
     ):
         """Initializes a tabular model MDP.
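
For reference, a minimal standalone sketch (not part of the patch) of the new `horizon` semantics introduced above. The `_HorizonDemo` class name is hypothetical; it only mirrors the `terminal` check and the `Optional[int]` horizon shown in the diff.

    from typing import Optional

    class _HorizonDemo:
        """Hypothetical stand-in mirroring the patched horizon handling."""

        def __init__(self, horizon: Optional[int] = None):
            # `None` now means an infinite horizon instead of `np.inf`.
            self.horizon = horizon

        def terminal(self, state: int, n_actions_taken: int) -> bool:
            """Checks if state is terminal."""
            return self.horizon is not None and n_actions_taken >= self.horizon

    # Finite horizon: the episode ends once `horizon` actions have been taken.
    assert _HorizonDemo(horizon=3).terminal(0, 3)
    # Infinite horizon: the timestep count alone never triggers termination.
    assert not _HorizonDemo(horizon=None).terminal(0, 10_000)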