From 65d5f18cb9b4be5a1048b088546af4d8fe3b7697 Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Sat, 8 Aug 2015 16:37:44 +0200 Subject: [PATCH 1/7] Moving steps/sec measurement to experiment class --- deep_q_rl/ale_agent.py | 5 ----- deep_q_rl/ale_experiment.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py index 5ec16b4..f09a67c 100755 --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -132,7 +132,6 @@ def start_episode(self, observation): # We report the mean loss for every epoch. self.loss_averages = [] - self.start_time = time.time() return_action = self.rng.randint(0, self.num_actions) self.last_action = return_action @@ -244,7 +243,6 @@ def end_episode(self, reward, terminal=True): self.episode_reward += reward self.step_counter += 1 - total_time = time.time() - self.start_time if self.testing: # If we run out of time, only count the last episode if @@ -260,9 +258,6 @@ def end_episode(self, reward, terminal=True): np.clip(reward, -1, 1), True) - logging.info("steps/second: {:.2f}".format(\ - self.step_counter/total_time)) - if self.batch_counter > 0: self._update_learning_file() logging.info("average loss: {:.4f}".format(\ diff --git a/deep_q_rl/ale_experiment.py b/deep_q_rl/ale_experiment.py index a9a4477..2913b59 100755 --- a/deep_q_rl/ale_experiment.py +++ b/deep_q_rl/ale_experiment.py @@ -11,6 +11,8 @@ # Number of rows to crop off the bottom of the (downsampled) screen. # This is appropriate for breakout, but it may need to be modified # for other games. +import time + CROP_OFFSET = 8 @@ -136,8 +138,12 @@ def run_episode(self, max_steps, testing): start_lives = self.ale.lives() + t0 = time.time() + action = self.agent.start_episode(self.get_observation()) num_steps = 0 + terminal = False + while True: reward = self._step(self.min_action_set[action]) self.terminal_lol = (self.death_ends_episode and not testing and @@ -150,6 +156,12 @@ def run_episode(self, max_steps, testing): break action = self.agent.step(reward, self.get_observation()) + + if not testing: + t1 = time.time() + total_time = t1 - t0 + logging.info("steps/second: {:.2f}".format(num_steps/total_time)) + return terminal, num_steps From b966ff83bada403da9526285760f55f77ff040c6 Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Mon, 17 Aug 2015 15:50:18 +0200 Subject: [PATCH 2/7] Moving logging functionality to separate methods --- deep_q_rl/ale_agent.py | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py index f09a67c..18b9426 100755 --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -73,9 +73,6 @@ def __init__(self, q_network, epsilon_start, epsilon_min, self.testing = False - self._open_results_file() - self._open_learning_file() - self.episode_counter = 0 self.batch_counter = 0 @@ -87,15 +84,32 @@ def __init__(self, q_network, epsilon_start, epsilon_min, self.last_img = None self.last_action = None + self.export_dir = self._create_export_dir() + self._open_results_file() + self._open_learning_file() + # region Dumping/Logging + def _create_export_dir(self): + # CREATE A FOLDER TO HOLD RESULTS + time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime()) + export_dir = self.exp_pref + time_str + \ + "{}".format(self.parameters.learning_rate).replace(".", "p") + "_" \ + + "{}".format(self.parameters.discount).replace(".", "p") + try: + os.stat(export_dir) + except OSError: + os.makedirs(export_dir) + + return export_dir + 
def _open_results_file(self): - logging.info("OPENING " + self.exp_dir + '/results.csv') - self.results_file = open(self.exp_dir + '/results.csv', 'w', 0) + logging.info("OPENING " + self.export_dir + '/results.csv') + self.results_file = open(self.export_dir + '/results.csv', 'w', 0) self.results_file.write(\ 'epoch,num_episodes,total_reward,reward_per_epoch,mean_q\n') self.results_file.flush() def _open_learning_file(self): - self.learning_file = open(self.exp_dir + '/learning.csv', 'w', 0) + self.learning_file = open(self.export_dir + '/learning.csv', 'w', 0) self.learning_file.write('mean_loss,epsilon\n') self.learning_file.flush() @@ -112,6 +126,12 @@ def _update_learning_file(self): self.learning_file.write(out) self.learning_file.flush() + def _persist_network(self, network_filename): + full_filename = os.path.join(self.export_dir, network_filename) + with open(full_filename, 'w') as net_file: + cPickle.dump(self.network, net_file, -1) + + # endregion def start_episode(self, observation): """ This method is called once at the beginning of each episode. @@ -260,15 +280,11 @@ def end_episode(self, reward, terminal=True): if self.batch_counter > 0: self._update_learning_file() - logging.info("average loss: {:.4f}".format(\ - np.mean(self.loss_averages))) - + logging.info("average loss: {:.4f}".format(np.mean(self.loss_averages))) def finish_epoch(self, epoch): - net_file = open(self.exp_dir + '/network_file_' + str(epoch) + \ - '.pkl', 'w') - cPickle.dump(self.network, net_file, -1) - net_file.close() + network_filename = 'network_file_' + str(epoch) + '.pkl' + self._persist_network(network_filename) def start_testing(self): self.testing = True From 932d5b427e73c0e67208ad60531ef5d98c7a9cde Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Mon, 17 Aug 2015 16:44:29 +0200 Subject: [PATCH 3/7] Measuring the whole episode (including logging, etc.) 
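
The steps/second figure now covers the whole call to run_episode,
including the agent's end-of-episode logging and training bookkeeping,
rather than only the inner step loop. A self-contained sketch of the
timing pattern used in run_epoch (the dummy run_episode below is only a
stand-in for the real ALEExperiment method; the log format matches the
diff):

    import logging
    import time

    logging.basicConfig(level=logging.INFO)

    def run_episode(steps_left, testing):
        # Stand-in for ALEExperiment.run_episode: pretend to play
        # up to 500 steps and return (terminal, num_steps).
        time.sleep(0.01)
        return False, min(steps_left, 500)

    def run_epoch(epoch, num_steps, testing=False):
        prefix = "testing" if testing else "training"
        steps_left = num_steps
        while steps_left > 0:
            t0 = time.time()
            _, episode_steps = run_episode(steps_left, testing)
            steps_left -= episode_steps
            total_time = time.time() - t0
            # Throughput includes everything done during the episode.
            logging.info("[{:8}] epoch {:3} | num_steps {:7} "
                         "steps_left {:7} steps/second: {:>7.2f}".format(
                             prefix, epoch, episode_steps, steps_left,
                             episode_steps / total_time))

    run_epoch(1, 2000)
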
--- deep_q_rl/ale_experiment.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/deep_q_rl/ale_experiment.py b/deep_q_rl/ale_experiment.py index 2913b59..f5e4e72 100755 --- a/deep_q_rl/ale_experiment.py +++ b/deep_q_rl/ale_experiment.py @@ -72,12 +72,15 @@ def run_epoch(self, epoch, num_steps, testing=False): steps_left = num_steps while steps_left > 0: prefix = "testing" if testing else "training" - logging.info(prefix + " epoch: " + str(epoch) + " steps_left: " + - str(steps_left)) - _, num_steps = self.run_episode(steps_left, testing) + t0 = time.time() + _, num_steps = self.run_episode(steps_left, testing) steps_left -= num_steps + t1 = time.time() + total_time = t1 - t0 + logging.info("[{:8}] epoch {:3} | num_steps {:7} steps_left {:7} steps/second: {:>7.2f}".format( + prefix, epoch, num_steps, steps_left, num_steps / total_time)) def _init_episode(self): """ This method resets the game if needed, performs enough null @@ -138,8 +141,6 @@ def run_episode(self, max_steps, testing): start_lives = self.ale.lives() - t0 = time.time() - action = self.agent.start_episode(self.get_observation()) num_steps = 0 terminal = False @@ -157,14 +158,8 @@ def run_episode(self, max_steps, testing): action = self.agent.step(reward, self.get_observation()) - if not testing: - t1 = time.time() - total_time = t1 - t0 - logging.info("steps/second: {:.2f}".format(num_steps/total_time)) - return terminal, num_steps - def get_observation(self): """ Resize and merge the previous two screen images """ From 8fc40a4db104efbf7865badaaf9d4ede2636fb96 Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Mon, 17 Aug 2015 16:54:55 +0200 Subject: [PATCH 4/7] Making plotting scripts executable --- deep_q_rl/plot_filters.py | 1 + deep_q_rl/plot_results.py | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 deep_q_rl/plot_filters.py mode change 100644 => 100755 deep_q_rl/plot_results.py diff --git a/deep_q_rl/plot_filters.py b/deep_q_rl/plot_filters.py old mode 100644 new mode 100755 index 52a3da6..e79e199 --- a/deep_q_rl/plot_filters.py +++ b/deep_q_rl/plot_filters.py @@ -1,3 +1,4 @@ +#! /usr/bin/env python """ Utility to plot the first layer of convolutions learned by the Deep q-network. diff --git a/deep_q_rl/plot_results.py b/deep_q_rl/plot_results.py old mode 100644 new mode 100755 index 8588cf4..9743e44 --- a/deep_q_rl/plot_results.py +++ b/deep_q_rl/plot_results.py @@ -1,3 +1,4 @@ +#! 
/usr/bin/env python """Plots data corresponding to Figure 2 in Playing Atari with Deep Reinforcement Learning From dec12ec6ec32a31d3f0a2a1abbb702fc71c000dc Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Mon, 17 Aug 2015 17:06:09 +0200 Subject: [PATCH 5/7] simplify launcher and generalize constructors through parameters create base classes updating unit tests --- deep_q_rl/ale_agent.py | 76 ++++++++------ deep_q_rl/ale_agent_base.py | 66 ++++++++++++ deep_q_rl/ale_agent_random.py | 31 ++++++ deep_q_rl/ale_experiment.py | 6 +- deep_q_rl/ale_parameters_default.py | 3 + deep_q_rl/launcher.py | 63 ++++-------- deep_q_rl/q_learner.py | 30 ++++++ deep_q_rl/q_network.py | 149 ++++++++++++++-------------- deep_q_rl/run_nature.py | 13 ++- deep_q_rl/run_nips.py | 11 +- deep_q_rl/run_random.py | 66 ++++++++++++ deep_q_rl/test/test_q_network.py | 103 +++++++++---------- 12 files changed, 404 insertions(+), 213 deletions(-) mode change 100755 => 100644 deep_q_rl/ale_agent.py create mode 100644 deep_q_rl/ale_agent_base.py create mode 100644 deep_q_rl/ale_agent_random.py mode change 100755 => 100644 deep_q_rl/ale_experiment.py create mode 100644 deep_q_rl/ale_parameters_default.py create mode 100644 deep_q_rl/q_learner.py create mode 100755 deep_q_rl/run_random.py diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py old mode 100755 new mode 100644 index 18b9426..ae735ef --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -13,44 +13,34 @@ import numpy as np +from ale_agent_base import AgentBase import ale_data_set import sys sys.setrecursionlimit(10000) -class NeuralAgent(object): - def __init__(self, q_network, epsilon_start, epsilon_min, - epsilon_decay, replay_memory_size, exp_pref, - replay_start_size, update_frequency, rng): +class NeuralAgent(AgentBase): + def __init__(self, parameters): + super(NeuralAgent, self).__init__(parameters) - self.network = q_network - self.epsilon_start = epsilon_start - self.epsilon_min = epsilon_min - self.epsilon_decay = epsilon_decay - self.replay_memory_size = replay_memory_size - self.exp_pref = exp_pref - self.replay_start_size = replay_start_size - self.update_frequency = update_frequency - self.rng = rng + self.parameters = parameters + self.network = None + self.action_set = None + self.num_actions = -1 - self.phi_length = self.network.num_frames - self.image_width = self.network.input_width - self.image_height = self.network.input_height - - # CREATE A FOLDER TO HOLD RESULTS - time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime()) - self.exp_dir = self.exp_pref + time_str + \ - "{}".format(self.network.lr).replace(".", "p") + "_" \ - + "{}".format(self.network.discount).replace(".", "p") - - try: - os.stat(self.exp_dir) - except OSError: - os.makedirs(self.exp_dir) - - self.num_actions = self.network.num_actions + self.epsilon_start = self.parameters.epsilon_start + self.epsilon_min = self.parameters.epsilon_min + self.epsilon_decay = self.parameters.epsilon_decay + self.replay_memory_size = self.parameters.replay_memory_size + self.exp_pref = self.parameters.experiment_prefix + self.replay_start_size = self.parameters.replay_start_size + self.update_frequency = self.parameters.update_frequency + self.phi_length = self.parameters.phi_length + self.image_width = self.parameters.resized_width + self.image_height = self.parameters.resized_height + self.rng = self.parameters.rng self.data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, @@ -73,9 +63,10 @@ def __init__(self, q_network, epsilon_start, epsilon_min, 
self.testing = False + self.current_epoch = 0 self.episode_counter = 0 self.batch_counter = 0 - + self.total_reward = 0 self.holdout_data = None # In order to add an element to the data set we need the @@ -87,6 +78,25 @@ def __init__(self, q_network, epsilon_start, epsilon_min, self.export_dir = self._create_export_dir() self._open_results_file() self._open_learning_file() + + def initialize(self, action_set): + self.action_set = action_set + self.num_actions = len(self.action_set) + + if self.parameters.qlearner_type is None: + raise Exception("The QLearner/network type has not been specified") + + if self.parameters.nn_file is None: + self.network = self.parameters.qlearner_type(self.num_actions, + self.parameters.resized_width, + self.parameters.resized_height, + self.parameters.phi_length, + self.parameters) + + else: + handle = open(self.parameters.nn_file, 'r') + self.network = cPickle.load(handle) + # region Dumping/Logging def _create_export_dir(self): # CREATE A FOLDER TO HOLD RESULTS @@ -132,6 +142,10 @@ def _persist_network(self, network_filename): cPickle.dump(self.network, net_file, -1) # endregion + + def start_epoch(self, epoch): + self.current_epoch = epoch + def start_episode(self, observation): """ This method is called once at the beginning of each episode. @@ -286,7 +300,7 @@ def finish_epoch(self, epoch): network_filename = 'network_file_' + str(epoch) + '.pkl' self._persist_network(network_filename) - def start_testing(self): + def start_testing(self, epoch): self.testing = True self.total_reward = 0 self.episode_counter = 0 diff --git a/deep_q_rl/ale_agent_base.py b/deep_q_rl/ale_agent_base.py new file mode 100644 index 0000000..c94ab11 --- /dev/null +++ b/deep_q_rl/ale_agent_base.py @@ -0,0 +1,66 @@ +from abc import ABCMeta, abstractmethod + + +class AgentBase(object): + __metaclass__ = ABCMeta + + def __init__(self, parameters): + pass + + @abstractmethod + def initialize(self, action_set): + pass + + @abstractmethod + def start_episode(self, observation): + """ + This method is called once at the beginning of each episode. + No reward is provided, because reward is only available after + an action has been taken. + Arguments: + observation - height x width numpy array + Returns: + An integer action + """ + pass + + @abstractmethod + def step(self, reward, observation): + """ + This method is called each time step. + Arguments: + reward - Real valued reward. + observation - A height x width numpy array + Returns: + An integer action. + """ + pass + + @abstractmethod + def end_episode(self, reward, terminal): + """ + This function is called once at the end of an episode. + Arguments: + reward - Real valued reward. 
+ terminal - Whether the episode ended intrinsically + (ie we didn't run out of steps) + Returns: + None + """ + pass + + @abstractmethod + def start_epoch(self, epoch): + pass + + @abstractmethod + def finish_epoch(self, epoch): + pass + + @abstractmethod + def start_testing(self, epoch): + pass + + @abstractmethod + def finish_testing(self, epoch): + pass \ No newline at end of file diff --git a/deep_q_rl/ale_agent_random.py b/deep_q_rl/ale_agent_random.py new file mode 100644 index 0000000..4602596 --- /dev/null +++ b/deep_q_rl/ale_agent_random.py @@ -0,0 +1,31 @@ +import random +from ale_agent_base import AgentBase + +class AgentRandom(AgentBase): + def __init__(self, parameters): + super(AgentRandom, self).__init__(parameters) + self.action_set = None + + def initialize(self, action_set): + self.action_set = action_set + + def start_episode(self, observation): + return self.step(None, None) + + def step(self, reward, observation): + return random.randint(0, len(self.action_set)-1) + + def end_episode(self, reward, terminal): + pass + + def start_epoch(self, epoch): + pass + + def finish_epoch(self, epoch): + pass + + def start_testing(self, epoch): + pass + + def finish_testing(self, epoch): + pass \ No newline at end of file diff --git a/deep_q_rl/ale_experiment.py b/deep_q_rl/ale_experiment.py old mode 100755 new mode 100644 index f5e4e72..711ee41 --- a/deep_q_rl/ale_experiment.py +++ b/deep_q_rl/ale_experiment.py @@ -48,12 +48,16 @@ def run(self): Run the desired number of training epochs, a testing epoch is conducted after each training epoch. """ + + self.agent.initialize(self.ale.getMinimalActionSet()) + for epoch in range(1, self.num_epochs + 1): + self.agent.start_epoch(epoch) self.run_epoch(epoch, self.epoch_length) self.agent.finish_epoch(epoch) if self.test_length > 0: - self.agent.start_testing() + self.agent.start_testing(epoch) self.run_epoch(epoch, self.test_length, True) self.agent.finish_testing(epoch) diff --git a/deep_q_rl/ale_parameters_default.py b/deep_q_rl/ale_parameters_default.py new file mode 100644 index 0000000..38247aa --- /dev/null +++ b/deep_q_rl/ale_parameters_default.py @@ -0,0 +1,3 @@ +class ParametersDefault(object): + # Anything that should be always set + pass diff --git a/deep_q_rl/launcher.py b/deep_q_rl/launcher.py index c136f01..f708cfc 100755 --- a/deep_q_rl/launcher.py +++ b/deep_q_rl/launcher.py @@ -4,17 +4,15 @@ run_nips.py or run_nature.py. 
""" +from inspect import ismethod import os import argparse import logging import ale_python_interface -import cPickle import numpy as np import theano - import ale_experiment -import ale_agent -import q_network + def process_args(args, defaults, description): """ @@ -160,8 +158,15 @@ def process_args(args, defaults, description): parameters.freeze_interval = (parameters.freeze_interval // parameters.update_frequency) - return parameters + # Get default parameters and apply to the parameters namespace when missing + defaults_dict = dict((key.lower(), value) for key, value in defaults.__dict__.iteritems() + if not ismethod(value) and not key.startswith('__')) + for k in defaults_dict: + if not hasattr(parameters, k): + setattr(parameters, k, defaults_dict[k]) + + return parameters def launch(args, defaults, description): @@ -179,15 +184,15 @@ def launch(args, defaults, description): full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom) if parameters.deterministic: - rng = np.random.RandomState(123456) + parameters.rng = np.random.RandomState(123456) else: - rng = np.random.RandomState() + parameters.rng = np.random.RandomState() if parameters.cudnn_deterministic: theano.config.dnn.conv.algo_bwd = 'deterministic' ale = ale_python_interface.ALEInterface() - ale.setInt('random_seed', rng.randint(1000)) + ale.setInt('random_seed', parameters.rng.randint(1000)) if parameters.display_screen: import sys @@ -202,42 +207,13 @@ def launch(args, defaults, description): ale.loadROM(full_rom_path) - num_actions = len(ale.getMinimalActionSet()) - - if parameters.nn_file is None: - network = q_network.DeepQLearner(defaults.RESIZED_WIDTH, - defaults.RESIZED_HEIGHT, - num_actions, - parameters.phi_length, - parameters.discount, - parameters.learning_rate, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.network_type, - parameters.update_rule, - parameters.batch_accumulator, - rng) - else: - handle = open(parameters.nn_file, 'r') - network = cPickle.load(handle) - - agent = ale_agent.NeuralAgent(network, - parameters.epsilon_start, - parameters.epsilon_min, - parameters.epsilon_decay, - parameters.replay_memory_size, - parameters.experiment_prefix, - parameters.replay_start_size, - parameters.update_frequency, - rng) + if parameters.agent_type is None: + raise Exception("The agent type has not been specified") + agent = parameters.agent_type(parameters) experiment = ale_experiment.ALEExperiment(ale, agent, - defaults.RESIZED_WIDTH, - defaults.RESIZED_HEIGHT, + parameters.resized_width, + parameters.resized_height, parameters.resize_method, parameters.epochs, parameters.steps_per_epoch, @@ -245,8 +221,7 @@ def launch(args, defaults, description): parameters.frame_skip, parameters.death_ends_episode, parameters.max_start_nullops, - rng) - + parameters.rng) experiment.run() diff --git a/deep_q_rl/q_learner.py b/deep_q_rl/q_learner.py new file mode 100644 index 0000000..3654f4f --- /dev/null +++ b/deep_q_rl/q_learner.py @@ -0,0 +1,30 @@ +from abc import ABCMeta, abstractmethod + + +class QLearner: + __metaclass__ = ABCMeta + + def __init__(self, + num_actions, + input_width, input_height, num_frames, + parameters): + pass + + @abstractmethod + def train(self, states, actions, rewards, next_states, terminals): + """ + Train one batch. + Arguments: + states - b x f x h x w numpy array, where b is batch size, + f is num frames, h is height and w is width. 
+ actions - b x 1 numpy array of integers + rewards - b x 1 numpy array + next_states - b x f x h x w numpy array + terminals - b x 1 numpy boolean array (currently ignored) + Returns: average loss + """ + pass + + @abstractmethod + def q_vals(self, state): + pass diff --git a/deep_q_rl/q_network.py b/deep_q_rl/q_network.py index 489c09b..39b6b25 100644 --- a/deep_q_rl/q_network.py +++ b/deep_q_rl/q_network.py @@ -13,49 +13,58 @@ Author of Lasagne port: Nissan Pow Modifications: Nathan Sprague """ + import lasagne import numpy as np import theano import theano.tensor as T + +from q_learner import QLearner from updates import deepmind_rmsprop -class DeepQLearner: +class DeepQLearner(QLearner): """ Deep Q-learning network using Lasagne. """ - def __init__(self, input_width, input_height, num_actions, - num_frames, discount, learning_rate, rho, - rms_epsilon, momentum, clip_delta, freeze_interval, - batch_size, network_type, update_rule, - batch_accumulator, rng, input_scale=255.0): + def __init__(self, + num_actions, + input_width, input_height, num_frames, + parameters): + super(DeepQLearner, self).__init__(num_actions, + input_width, input_height, num_frames, + parameters) + + self.num_actions = num_actions + self.parameters = parameters self.input_width = input_width self.input_height = input_height - self.num_actions = num_actions self.num_frames = num_frames - self.batch_size = batch_size - self.discount = discount - self.rho = rho - self.lr = learning_rate - self.rms_epsilon = rms_epsilon - self.momentum = momentum - self.clip_delta = clip_delta - self.freeze_interval = freeze_interval - self.rng = rng + self.discount = self.parameters.discount + self.rho = self.parameters.rms_decay + self.lr = self.parameters.learning_rate + self.rms_epsilon = self.parameters.rms_epsilon + self.momentum = self.parameters.momentum + self.clip_delta = self.parameters.clip_delta + self.freeze_interval = self.parameters.freeze_interval + self.batch_size = self.parameters.batch_size + self.update_rule = self.parameters.update_rule + self.batch_accumulator = self.parameters.batch_accumulator + + self.rng = parameters.rng lasagne.random.set_rng(self.rng) + self.network_type = self.parameters.network_type + self.update_counter = 0 - self.l_out = self.build_network(network_type, input_width, input_height, - num_actions, num_frames, batch_size) + self.l_out = self._build_network() if self.freeze_interval > 0: - self.next_l_out = self.build_network(network_type, input_width, - input_height, num_actions, - num_frames, batch_size) - self.reset_q_hat() + self.next_l_out = self._build_network() + self._reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') @@ -64,50 +73,50 @@ def __init__(self, input_width, input_height, num_actions, terminals = T.icol('terminals') self.states_shared = theano.shared( - np.zeros((batch_size, num_frames, input_height, input_width), + np.zeros((self.batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( - np.zeros((batch_size, num_frames, input_height, input_width), + np.zeros((self.batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( - np.zeros((batch_size, 1), dtype=theano.config.floatX), + np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), + np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, 
True)) self.terminals_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), + np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) - q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) - + q_vals = lasagne.layers.get_output(self.l_out, states / self.parameters.input_scale) + if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, - next_states / input_scale) + next_states / self.parameters.input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, - next_states / input_scale) + next_states / self.parameters.input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) - diff = target - q_vals[T.arange(batch_size), + diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) - if batch_accumulator == 'sum': + if self.batch_accumulator == 'sum': loss = T.sum(diff ** 2) - elif batch_accumulator == 'mean': + elif self.batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: - raise ValueError("Bad accumulator: {}".format(batch_accumulator)) + raise ValueError("Bad accumulator: {}".format(self.batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { @@ -117,16 +126,17 @@ def __init__(self, input_width, input_height, num_actions, actions: self.actions_shared, terminals: self.terminals_shared } - if update_rule == 'deepmind_rmsprop': + + if self.update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) - elif update_rule == 'rmsprop': + elif self.update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) - elif update_rule == 'sgd': + elif self.update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: - raise ValueError("Unrecognized update: {}".format(update_rule)) + raise ValueError("Unrecognized update: {}".format(self.update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, @@ -137,29 +147,24 @@ def __init__(self, input_width, input_height, num_actions, self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}) - def build_network(self, network_type, input_width, input_height, - output_dim, num_frames, batch_size): - if network_type == "nature_cuda": - return self.build_nature_network(input_width, input_height, - output_dim, num_frames, batch_size) - if network_type == "nature_dnn": - return self.build_nature_network_dnn(input_width, input_height, - output_dim, num_frames, - batch_size) - elif network_type == "nips_cuda": - return self.build_nips_network(input_width, input_height, - output_dim, num_frames, batch_size) - elif network_type == "nips_dnn": - return self.build_nips_network_dnn(input_width, input_height, - output_dim, num_frames, - batch_size) - elif network_type == "linear": - return self.build_linear_network(input_width, input_height, - output_dim, num_frames, batch_size) + def _build_network(self): + if self.network_type == "nature_cuda": + return self._build_nature_network(self.input_width, self.input_height, + self.num_actions, self.num_frames, self.batch_size) + if self.network_type == "nature_dnn": + return self._build_nature_network_dnn(self.input_width, self.input_height, + self.num_actions, self.num_frames, self.batch_size) + 
elif self.network_type == "nips_cuda": + return self._build_nips_network(self.input_width, self.input_height, + self.num_actions, self.num_frames, self.batch_size) + elif self.network_type == "nips_dnn": + return self._build_nips_network_dnn(self.input_width, self.input_height, + self.num_actions, self.num_frames, self.batch_size) + elif self.network_type == "linear": + return self._build_linear_network(self.input_width, self.input_height, + self.num_actions, self.num_frames, self.batch_size) else: - raise ValueError("Unrecognized network: {}".format(network_type)) - - + raise ValueError("Unrecognized network: {}".format(self.network_type)) def train(self, states, actions, rewards, next_states, terminals): """ @@ -184,7 +189,7 @@ def train(self, states, actions, rewards, next_states, terminals): self.terminals_shared.set_value(terminals) if (self.freeze_interval > 0 and self.update_counter % self.freeze_interval == 0): - self.reset_q_hat() + self._reset_q_hat() loss, _ = self._train() self.update_counter += 1 return np.sqrt(loss) @@ -202,11 +207,11 @@ def choose_action(self, state, epsilon): q_vals = self.q_vals(state) return np.argmax(q_vals) - def reset_q_hat(self): + def _reset_q_hat(self): all_params = lasagne.layers.helper.get_all_param_values(self.l_out) lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) - def build_nature_network(self, input_width, input_height, output_dim, + def _build_nature_network(self, input_width, input_height, output_dim, num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. @@ -269,7 +274,7 @@ def build_nature_network(self, input_width, input_height, output_dim, return l_out - def build_nature_network_dnn(self, input_width, input_height, output_dim, + def _build_nature_network_dnn(self, input_width, input_height, output_dim, num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. @@ -330,7 +335,7 @@ def build_nature_network_dnn(self, input_width, input_height, output_dim, - def build_nips_network(self, input_width, input_height, output_dim, + def _build_nips_network(self, input_width, input_height, output_dim, num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. @@ -385,7 +390,7 @@ def build_nips_network(self, input_width, input_height, output_dim, return l_out - def build_nips_network_dnn(self, input_width, input_height, output_dim, + def _build_nips_network_dnn(self, input_width, input_height, output_dim, num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. @@ -441,7 +446,7 @@ def build_nips_network_dnn(self, input_width, input_height, output_dim, return l_out - def build_linear_network(self, input_width, input_height, output_dim, + def _build_linear_network(self, input_width, input_height, output_dim, num_frames, batch_size): """ Build a simple linear learner. 
Useful for creating @@ -461,11 +466,3 @@ def build_linear_network(self, input_width, input_height, output_dim, ) return l_out - -def main(): - net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000, - 32, 'nature_cuda') - - -if __name__ == '__main__': - main() diff --git a/deep_q_rl/run_nature.py b/deep_q_rl/run_nature.py index 2da46bc..bceeefe 100755 --- a/deep_q_rl/run_nature.py +++ b/deep_q_rl/run_nature.py @@ -7,11 +7,14 @@ Nature, 518(7540):529-533, February 2015 """ +import sys +from ale_agent import NeuralAgent +from ale_parameters_default import ParametersDefault +from q_network import DeepQLearner import launcher -import sys -class Defaults: +class Parameters(ParametersDefault): # ---------------------- # Experiment Parameters # ---------------------- @@ -52,6 +55,7 @@ class Defaults: BATCH_SIZE = 32 NETWORK_TYPE = "nature_dnn" FREEZE_INTERVAL = 10000 + INPUT_SCALE = 255. REPLAY_START_SIZE = 50000 RESIZE_METHOD = 'scale' RESIZED_WIDTH = 84 @@ -61,5 +65,8 @@ class Defaults: DETERMINISTIC = True CUDNN_DETERMINISTIC = False + AGENT_TYPE = NeuralAgent + QLEARNER_TYPE = DeepQLearner + if __name__ == "__main__": - launcher.launch(sys.argv[1:], Defaults, __doc__) + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/run_nips.py b/deep_q_rl/run_nips.py index 8a6ddfc..edc62ca 100755 --- a/deep_q_rl/run_nips.py +++ b/deep_q_rl/run_nips.py @@ -7,11 +7,14 @@ NIPS Deep Learning Workshop 2013 """ +from ale_agent import NeuralAgent +from ale_parameters_default import ParametersDefault +from q_network import DeepQLearner import launcher import sys -class Defaults: +class Parameters(ParametersDefault): # ---------------------- # Experiment Parameters # ---------------------- @@ -47,6 +50,7 @@ class Defaults: BATCH_SIZE = 32 NETWORK_TYPE = "nips_dnn" FREEZE_INTERVAL = -1 + INPUT_SCALE = 255. REPLAY_START_SIZE = 100 RESIZE_METHOD = 'crop' RESIZED_WIDTH = 84 @@ -56,5 +60,8 @@ class Defaults: DETERMINISTIC = True CUDNN_DETERMINISTIC = False + AGENT_TYPE = NeuralAgent + QLEARNER_TYPE = DeepQLearner + if __name__ == "__main__": - launcher.launch(sys.argv[1:], Defaults, __doc__) + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/run_random.py b/deep_q_rl/run_random.py new file mode 100755 index 0000000..a234830 --- /dev/null +++ b/deep_q_rl/run_random.py @@ -0,0 +1,66 @@ +#! /usr/bin/env python +""" +Execute a training run of using an agent that plays random moves + +""" +import sys +from ale_agent_random import AgentRandom + +from ale_parameters_default import ParametersDefault +import launcher + + +class Parameters(ParametersDefault): + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 250000 + EPOCHS = 200 + STEPS_PER_TEST = 125000 + + # ---------------------- + # ALE Parameters + # ---------------------- + BASE_ROM_PATH = "../roms/" + ROM = 'breakout.bin' + FRAME_SKIP = 4 + REPEAT_ACTION_PROBABILITY = 0 + + # ---------------------- + # Agent/Network parameters: + # ---------------------- + UPDATE_RULE = 'deepmind_rmsprop' + BATCH_ACCUMULATOR = 'sum' + LEARNING_RATE = .00025 + DISCOUNT = .99 + RMS_DECAY = .95 # (Rho) + RMS_EPSILON = .01 + MOMENTUM = 0 # Note that the "momentum" value mentioned in the Nature + # paper is not used in the same way as a traditional momentum + # term. It is used to track gradient for the purpose of + # estimating the standard deviation. This package uses + # rho/RMS_DECAY to track both the history of the gradient + # and the squared gradient. 
+ CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = .1 + EPSILON_DECAY = 1000000 + PHI_LENGTH = 4 + UPDATE_FREQUENCY = 4 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + NETWORK_TYPE = "nature_dnn" + FREEZE_INTERVAL = 10000 + INPUT_SCALE = 255. + REPLAY_START_SIZE = 50000 + RESIZE_METHOD = 'scale' + RESIZED_WIDTH = 84 + RESIZED_HEIGHT = 84 + DEATH_ENDS_EPISODE = 'true' + MAX_START_NULLOPS = 30 + + AGENT_TYPE = AgentRandom + QLEARNER_TYPE = None + +if __name__ == "__main__": + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/test/test_q_network.py b/deep_q_rl/test/test_q_network.py index 82cd142..8e45b6b 100644 --- a/deep_q_rl/test/test_q_network.py +++ b/deep_q_rl/test/test_q_network.py @@ -7,9 +7,11 @@ import unittest import numpy.testing import lasagne +from deep_q_rl.ale_parameters_default import ParametersDefault import deep_q_rl.q_network as q_network + class ChainMDP(object): """Simple markov chain style MDP. Three "rooms" and one absorbing state. States are encoded for the q_network as arrays with @@ -52,7 +54,7 @@ def act(self, state, action_index): """ action 0 is left, 1 is right. """ - state_index = np.nonzero(state[0, 0, 0, :])[0][0] + state_index = np.nonzero(state[0, 0, 0, :])[0][0] next_index = state_index if np.random.random() < self.success_prob: @@ -80,16 +82,27 @@ class LinearTests(unittest.TestCase): Q-learning code operates as good-ol-fashioned Q-learning. These tests check that the basic updates code is working correctly. """ + def setUp(self): # Divide the desired learning rate by two, because loss is # defined as L^2, not 1/2 L^2. - self.learning_rate = .1 / 2.0 + self.parameters = ParametersDefault() + self.parameters.discount = .5 + self.parameters.learning_rate = .1 / 2.0 + self.parameters.rms_decay = 0 + self.parameters.rms_epsilon = 0 + self.parameters.momentum = 0 + self.parameters.clip_delta = 0 + self.parameters.freeze_interval = 0 + self.parameters.batch_size = 1 + self.parameters.network_type = 'linear' + self.parameters.update_rule = 'sgd' + self.parameters.batch_accumulator = 'sum' + self.parameters.input_scale = 1.0 - self.discount = .5 self.mdp = ChainMDP() - def all_q_vals(self, net): """ Helper method to get the entire Q-table """ @@ -101,7 +114,7 @@ def all_q_vals(self, net): def train(self, net, steps): mdp = self.mdp for _ in range(steps): - state = mdp.states[np.random.randint(0, mdp.num_states-1)] + state = mdp.states[np.random.randint(0, mdp.num_states - 1)] action_index = np.random.randint(0, mdp.num_actions) reward, next_state, terminal = mdp.act(state, action_index) @@ -109,13 +122,11 @@ def train(self, net, steps): terminal) def test_updates_sgd_no_freeze(self): - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.parameters.freeze_interval = -1 + + net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) mdp = self.mdp @@ -150,16 +161,11 @@ def test_updates_sgd_no_freeze(self): [[.07, 0], [0.0035, 0], [0, .1], [0, 0]]) - def test_convergence_sgd_no_freeze(self): - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) - + self.parameters.freeze_interval = -1 + net = 
q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) self.train(net, 1000) @@ -167,44 +173,34 @@ def test_convergence_sgd_no_freeze(self): [[.7, .25], [.35, .5], [.25, 1.0], [0., 0.]], 3) - def test_convergence_random_initialization(self): """ This test will only pass if terminal states are handled correctly. Otherwise the random initialization of the value of the terminal state will propagate back. """ - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.parameters.freeze_interval = -1 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) # Randomize initial q-values: params = lasagne.layers.helper.get_all_param_values(net.l_out) rand = np.random.random(params[0].shape) - rand = numpy.array(rand, dtype=theano.config.floatX) + rand = numpy.array(rand, dtype=theano.config.floatX) lasagne.layers.helper.set_all_param_values(net.l_out, [rand]) self.train(net, 1000) - numpy.testing.assert_almost_equal(self.all_q_vals(net)[0:3,:], + numpy.testing.assert_almost_equal(self.all_q_vals(net)[0:3, :], [[.7, .25], [.35, .5], [.25, 1.0]], 3) - - - def test_convergence_sgd_permanent_freeze(self): - freeze_interval = 1000000 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.parameters.freeze_interval = 1000000 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) self.train(net, 1000) @@ -213,13 +209,10 @@ def test_convergence_sgd_permanent_freeze(self): [0, 1.0], [0., 0.]], 3) def test_convergence_sgd_frequent_freeze(self): - freeze_interval = 2 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.parameters.freeze_interval = 2 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) self.train(net, 1000) @@ -228,19 +221,17 @@ def test_convergence_sgd_frequent_freeze(self): [.25, 1.0], [0., 0.]], 3) def test_convergence_sgd_one_freeze(self): - freeze_interval = 500 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.parameters.freeze_interval = 500 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, + num_actions=self.mdp.num_actions, num_frames=1, + parameters=self.parameters) - self.train(net, freeze_interval * 2) + self.train(net, self.parameters.freeze_interval * 2) numpy.testing.assert_almost_equal(self.all_q_vals(net), [[.7, 0], [.35, .5], [0, 1.0], [0., 0.]], 3) + if __name__ == "__main__": unittest.main() From 79824c2b8e1931269b5105b606c66629a8cb7e72 Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Mon, 17 Aug 2015 17:24:51 +0200 Subject: [PATCH 6/7] separate step method functionality (testing/learning) Make it simpler to override them independently --- deep_q_rl/ale_agent.py | 65 
+++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py index ae735ef..426917b 100644 --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -44,14 +44,14 @@ def __init__(self, parameters): self.data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, - rng=rng, + rng=self.rng, max_steps=self.replay_memory_size, phi_length=self.phi_length) # just needs to be big enough to create phi's self.test_data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, - rng=rng, + rng=self.rng, max_steps=self.phi_length * 2, phi_length=self.phi_length) self.epsilon = self.epsilon_start @@ -187,6 +187,35 @@ def _show_phis(self, phi1, phi2): plt.grid(color='r', linestyle='-', linewidth=1) plt.show() + def _step_testing(self, reward, observation): + action = self._choose_action(data_set=self.test_data_set, + epsilon=.05, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + return action + + def _step_training(self, reward, observation): + if len(self.data_set) > self.replay_start_size: + self.epsilon = max(self.epsilon_min, + self.epsilon - self.epsilon_rate) + + action = self._choose_action(data_set=self.data_set, + epsilon=self.epsilon, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + + if self.step_counter % self.update_frequency == 0: + loss = self._do_training() + self.batch_counter += 1 + self.loss_averages.append(loss) + + else: # Still gathering initial random data... + action = self._choose_action(data_set=self.data_set, + epsilon=self.epsilon, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + return action + def step(self, reward, observation): """ This method is called each time step. @@ -199,37 +228,13 @@ def step(self, reward, observation): An integer action. """ - - self.step_counter += 1 - - #TESTING--------------------------- + self.episode_reward += reward if self.testing: - self.episode_reward += reward - action = self._choose_action(self.test_data_set, .05, - observation, np.clip(reward, -1, 1)) - - #NOT TESTING--------------------------- + action = self._step_testing(reward, observation) else: + action = self._step_training(reward, observation) - if len(self.data_set) > self.replay_start_size: - self.epsilon = max(self.epsilon_min, - self.epsilon - self.epsilon_rate) - - action = self._choose_action(self.data_set, self.epsilon, - observation, - np.clip(reward, -1, 1)) - - if self.step_counter % self.update_frequency == 0: - loss = self._do_training() - self.batch_counter += 1 - self.loss_averages.append(loss) - - else: # Still gathering initial random data... 
- action = self._choose_action(self.data_set, self.epsilon, - observation, - np.clip(reward, -1, 1)) - - + self.step_counter += 1 self.last_action = action self.last_img = observation From 4cdc3b62884001e739ed9fa10f9455069c0eb72d Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Thu, 20 Aug 2015 00:52:13 +0200 Subject: [PATCH 7/7] Keeping line-length below 80 + fixing warnings --- deep_q_rl/ale_agent.py | 73 +++++++++-------- deep_q_rl/ale_agent_base.py | 2 +- deep_q_rl/ale_agent_random.py | 9 ++- deep_q_rl/ale_data_set.py | 3 +- deep_q_rl/ale_experiment.py | 19 +++-- deep_q_rl/ale_run_watch.py | 4 +- deep_q_rl/launcher.py | 100 ++++++++++++----------- deep_q_rl/plot_filters.py | 4 +- deep_q_rl/q_learner.py | 2 +- deep_q_rl/q_network.py | 135 ++++++++++++++++++------------- deep_q_rl/run_nature.py | 1 + deep_q_rl/run_nips.py | 1 + deep_q_rl/test/test_q_network.py | 90 ++++++++++++--------- 13 files changed, 248 insertions(+), 195 deletions(-) diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py index 426917b..46b1ec2 100644 --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -17,30 +17,31 @@ import ale_data_set import sys + sys.setrecursionlimit(10000) class NeuralAgent(AgentBase): - def __init__(self, parameters): - super(NeuralAgent, self).__init__(parameters) + def __init__(self, params): + super(NeuralAgent, self).__init__(params) - self.parameters = parameters + self.params = params self.network = None self.action_set = None self.num_actions = -1 - self.epsilon_start = self.parameters.epsilon_start - self.epsilon_min = self.parameters.epsilon_min - self.epsilon_decay = self.parameters.epsilon_decay - self.replay_memory_size = self.parameters.replay_memory_size - self.exp_pref = self.parameters.experiment_prefix - self.replay_start_size = self.parameters.replay_start_size - self.update_frequency = self.parameters.update_frequency - self.phi_length = self.parameters.phi_length - self.image_width = self.parameters.resized_width - self.image_height = self.parameters.resized_height + self.epsilon_start = self.params.epsilon_start + self.epsilon_min = self.params.epsilon_min + self.epsilon_decay = self.params.epsilon_decay + self.replay_memory_size = self.params.replay_memory_size + self.exp_pref = self.params.experiment_prefix + self.replay_start_size = self.params.replay_start_size + self.update_frequency = self.params.update_frequency + self.phi_length = self.params.phi_length + self.image_width = self.params.resized_width + self.image_height = self.params.resized_height - self.rng = self.parameters.rng + self.rng = self.params.rng self.data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, @@ -83,18 +84,18 @@ def initialize(self, action_set): self.action_set = action_set self.num_actions = len(self.action_set) - if self.parameters.qlearner_type is None: + if self.params.qlearner_type is None: raise Exception("The QLearner/network type has not been specified") - if self.parameters.nn_file is None: - self.network = self.parameters.qlearner_type(self.num_actions, - self.parameters.resized_width, - self.parameters.resized_height, - self.parameters.phi_length, - self.parameters) - + if self.params.nn_file is None: + self.network = self.params.qlearner_type( + num_actions=self.num_actions, + input_width=self.params.resized_width, + input_height=self.params.resized_height, + num_frames=self.params.phi_length, + params=self.params) else: - handle = open(self.parameters.nn_file, 'r') + handle = open(self.params.nn_file, 'r') self.network = 
cPickle.load(handle) # region Dumping/Logging @@ -102,8 +103,9 @@ def _create_export_dir(self): # CREATE A FOLDER TO HOLD RESULTS time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime()) export_dir = self.exp_pref + time_str + \ - "{}".format(self.parameters.learning_rate).replace(".", "p") + "_" \ - + "{}".format(self.parameters.discount).replace(".", "p") + "{}".format(self.params.learning_rate).replace(".", "p") \ + + "_" + \ + "{}".format(self.params.discount).replace(".", "p") try: os.stat(export_dir) except OSError: @@ -114,7 +116,7 @@ def _create_export_dir(self): def _open_results_file(self): logging.info("OPENING " + self.export_dir + '/results.csv') self.results_file = open(self.export_dir + '/results.csv', 'w', 0) - self.results_file.write(\ + self.results_file.write( 'epoch,num_episodes,total_reward,reward_per_epoch,mean_q\n') self.results_file.flush() @@ -124,9 +126,11 @@ def _open_learning_file(self): self.learning_file.flush() def _update_results_file(self, epoch, num_episodes, holdout_sum): - out = "{},{},{},{},{}\n".format(epoch, num_episodes, self.total_reward, + out = "{},{},{},{},{}\n".format(epoch, num_episodes, + self.total_reward, self.total_reward / float(num_episodes), holdout_sum) + self.results_file.write(out) self.results_file.flush() @@ -174,15 +178,14 @@ def start_episode(self, observation): return return_action - def _show_phis(self, phi1, phi2): import matplotlib.pyplot as plt for p in range(self.phi_length): - plt.subplot(2, self.phi_length, p+1) + plt.subplot(2, self.phi_length, p + 1) plt.imshow(phi1[p, :, :], interpolation='none', cmap="gray") plt.grid(color='r', linestyle='-', linewidth=1) for p in range(self.phi_length): - plt.subplot(2, self.phi_length, p+5) + plt.subplot(2, self.phi_length, p + 5) plt.imshow(phi2[p, :, :], interpolation='none', cmap="gray") plt.grid(color='r', linestyle='-', linewidth=1) plt.show() @@ -209,7 +212,7 @@ def _step_training(self, reward, observation): self.batch_counter += 1 self.loss_averages.append(loss) - else: # Still gathering initial random data... + else: # Still gathering initial random data... action = self._choose_action(data_set=self.data_set, epsilon=self.epsilon, cur_img=observation, @@ -262,12 +265,11 @@ def _do_training(self): differently. """ states, actions, rewards, next_states, terminals = \ - self.data_set.random_batch( - self.network.batch_size) + self.data_set.random_batch( + self.network.batch_size) return self.network.train(states, actions, rewards, next_states, terminals) - def end_episode(self, reward, terminal=True): """ This function is called once at the end of an episode. 
@@ -299,7 +301,8 @@ def end_episode(self, reward, terminal=True): if self.batch_counter > 0: self._update_learning_file() - logging.info("average loss: {:.4f}".format(np.mean(self.loss_averages))) + logging.info( + "average loss: {:.4f}".format(np.mean(self.loss_averages))) def finish_epoch(self, epoch): network_filename = 'network_file_' + str(epoch) + '.pkl' diff --git a/deep_q_rl/ale_agent_base.py b/deep_q_rl/ale_agent_base.py index c94ab11..a492511 100644 --- a/deep_q_rl/ale_agent_base.py +++ b/deep_q_rl/ale_agent_base.py @@ -4,7 +4,7 @@ class AgentBase(object): __metaclass__ = ABCMeta - def __init__(self, parameters): + def __init__(self, params): pass @abstractmethod diff --git a/deep_q_rl/ale_agent_random.py b/deep_q_rl/ale_agent_random.py index 4602596..d9677f2 100644 --- a/deep_q_rl/ale_agent_random.py +++ b/deep_q_rl/ale_agent_random.py @@ -1,9 +1,10 @@ import random from ale_agent_base import AgentBase + class AgentRandom(AgentBase): - def __init__(self, parameters): - super(AgentRandom, self).__init__(parameters) + def __init__(self, params): + super(AgentRandom, self).__init__(params) self.action_set = None def initialize(self, action_set): @@ -13,7 +14,7 @@ def start_episode(self, observation): return self.step(None, None) def step(self, reward, observation): - return random.randint(0, len(self.action_set)-1) + return random.randint(0, len(self.action_set) - 1) def end_episode(self, reward, terminal): pass @@ -28,4 +29,4 @@ def start_testing(self, epoch): pass def finish_testing(self, epoch): - pass \ No newline at end of file + pass diff --git a/deep_q_rl/ale_data_set.py b/deep_q_rl/ale_data_set.py index fa96519..c17ca55 100644 --- a/deep_q_rl/ale_data_set.py +++ b/deep_q_rl/ale_data_set.py @@ -14,6 +14,7 @@ floatX = theano.config.floatX + class DataSet(object): """ Class represents a data set that stores a fixed-length history. """ @@ -34,7 +35,7 @@ def __init__(self, width, height, rng, max_steps=1000, phi_length=4, self.count = 0 self.max_steps = max_steps self.phi_length = phi_length - if capacity == None: + if capacity is None: self.capacity = max_steps + int(np.ceil(max_steps * .1)) else: self.capacity = capacity diff --git a/deep_q_rl/ale_experiment.py b/deep_q_rl/ale_experiment.py index 711ee41..b94d2a2 100644 --- a/deep_q_rl/ale_experiment.py +++ b/deep_q_rl/ale_experiment.py @@ -39,7 +39,7 @@ def __init__(self, ale, agent, resized_width, resized_height, self.height, self.width), dtype=np.uint8) - self.terminal_lol = False # Most recent episode ended on a loss of life + self.terminal_lol = False # Most recent episode ended on a loss of life self.max_start_nullops = max_start_nullops self.rng = rng @@ -72,7 +72,7 @@ def run_epoch(self, epoch, num_steps, testing=False): testing - True if this Epoch is used for testing and not training """ - self.terminal_lol = False # Make sure each epoch starts with a reset. + self.terminal_lol = False # Make sure each epoch starts with a reset. 
steps_left = num_steps while steps_left > 0: prefix = "testing" if testing else "training" @@ -83,8 +83,13 @@ def run_epoch(self, epoch, num_steps, testing=False): t1 = time.time() total_time = t1 - t0 - logging.info("[{:8}] epoch {:3} | num_steps {:7} steps_left {:7} steps/second: {:>7.2f}".format( - prefix, epoch, num_steps, steps_left, num_steps / total_time)) + logging.info("[{:8}] epoch {:3} | num_steps {:7} " \ + "steps_left {:7} steps/second: {:>7.2f}" + .format(prefix, + epoch, + num_steps, + steps_left, + num_steps / total_time)) def _init_episode(self): """ This method resets the game if needed, performs enough null @@ -98,14 +103,13 @@ def _init_episode(self): if self.max_start_nullops > 0: random_actions = self.rng.randint(0, self.max_start_nullops+1) for _ in range(random_actions): - self._act(0) # Null action + self._act(0) # Null action # Make sure the screen buffer is filled at the beginning of # each episode... self._act(0) self._act(0) - def _act(self, action): """Perform the indicated action for a single frame, return the resulting reward and store the resulting screen image in the @@ -188,7 +192,7 @@ def resize_image(self, image): # Crop the part we want crop_y_cutoff = resize_height - CROP_OFFSET - self.resized_height cropped = resized[crop_y_cutoff: - crop_y_cutoff + self.resized_height, :] + crop_y_cutoff + self.resized_height, :] return cropped elif self.resize_method == 'scale': @@ -197,4 +201,3 @@ def resize_image(self, image): interpolation=cv2.INTER_LINEAR) else: raise ValueError('Unrecognized image resize method.') - diff --git a/deep_q_rl/ale_run_watch.py b/deep_q_rl/ale_run_watch.py index 67a0bd5..73fd9a3 100644 --- a/deep_q_rl/ale_run_watch.py +++ b/deep_q_rl/ale_run_watch.py @@ -8,6 +8,7 @@ import subprocess import sys + def run_watch(): command = ['./run_nature.py', '--steps-per-epoch', '0', '--test-length', '10000', '--nn-file', sys.argv[1], @@ -17,8 +18,9 @@ def run_watch(): command.extend(['--rom', sys.argv[2]]) p1 = subprocess.Popen(command) - + p1.wait() + if __name__ == "__main__": run_watch() diff --git a/deep_q_rl/launcher.py b/deep_q_rl/launcher.py index f708cfc..828dc2f 100755 --- a/deep_q_rl/launcher.py +++ b/deep_q_rl/launcher.py @@ -41,11 +41,11 @@ def process_args(args, defaults, description): parser.add_argument('--experiment-prefix', dest="experiment_prefix", default=None, help='Experiment name prefix ' - '(default is the name of the game)') + '(default is the name of the game)') parser.add_argument('--frame-skip', dest="frame_skip", default=defaults.FRAME_SKIP, type=int, help='Every how many frames to process ' - '(default: %(default)s)') + '(default: %(default)s)') parser.add_argument('--repeat-action-probability', dest="repeat_action_probability", default=defaults.REPEAT_ACTION_PROBABILITY, type=float, @@ -58,7 +58,7 @@ def process_args(args, defaults, description): '(default: %(default)s)')) parser.add_argument('--batch-accumulator', dest="batch_accumulator", type=str, default=defaults.BATCH_ACCUMULATOR, - help=('sum|mean (default: %(default)s)')) + help='sum|mean (default: %(default)s)') parser.add_argument('--learning-rate', dest="learning_rate", type=float, default=defaults.LEARNING_RATE, help='Learning rate (default: %(default)s)') @@ -68,9 +68,9 @@ def process_args(args, defaults, description): parser.add_argument('--rms-epsilon', dest="rms_epsilon", type=float, default=defaults.RMS_EPSILON, help='Denominator epsilson for rms_prop ' + - '(default: %(default)s)') + '(default: %(default)s)') parser.add_argument('--momentum', 
type=float, default=defaults.MOMENTUM, - help=('Momentum term for Nesterov momentum. '+ + help=('Momentum term for Nesterov momentum. ' + '(default: %(default)s)')) parser.add_argument('--clip-delta', dest="clip_delta", type=float, default=defaults.CLIP_DELTA, @@ -110,7 +110,7 @@ def process_args(args, defaults, description): '(default: %(default)s)')) parser.add_argument('--update-frequency', dest="update_frequency", type=int, default=defaults.UPDATE_FREQUENCY, - help=('Number of actions before each SGD update. '+ + help=('Number of actions before each SGD update. ' + '(default: %(default)s)')) parser.add_argument('--replay-start-size', dest="replay_start_size", type=int, default=defaults.REPLAY_START_SIZE, @@ -137,36 +137,38 @@ def process_args(args, defaults, description): help=('Whether to use deterministic backprop. ' + '(default: %(default)s)')) - parameters = parser.parse_args(args) - if parameters.experiment_prefix is None: - name = os.path.splitext(os.path.basename(parameters.rom))[0] - parameters.experiment_prefix = name + params = parser.parse_args(args) + if params.experiment_prefix is None: + name = os.path.splitext(os.path.basename(params.rom))[0] + params.experiment_prefix = name - if parameters.death_ends_episode == 'true': - parameters.death_ends_episode = True - elif parameters.death_ends_episode == 'false': - parameters.death_ends_episode = False + if params.death_ends_episode == 'true': + params.death_ends_episode = True + elif params.death_ends_episode == 'false': + params.death_ends_episode = False else: raise ValueError("--death-ends-episode must be true or false") - if parameters.freeze_interval > 0: + if params.freeze_interval > 0: # This addresses an inconsistency between the Nature paper and # the Deepmind code. The paper states that the target network # update frequency is "measured in the number of parameter # updates". In the code it is actually measured in the number # of action choices. 
- parameters.freeze_interval = (parameters.freeze_interval // - parameters.update_frequency) + params.freeze_interval = (params.freeze_interval // + params.update_frequency) # Get default parameters and apply to the parameters namespace when missing - defaults_dict = dict((key.lower(), value) for key, value in defaults.__dict__.iteritems() - if not ismethod(value) and not key.startswith('__')) + defaults_dict = dict( + (k.lower(), v) for k, v in defaults.__dict__.iteritems() + if not ismethod(v) and not k.startswith('__') + ) for k in defaults_dict: - if not hasattr(parameters, k): - setattr(parameters, k, defaults_dict[k]) + if not hasattr(params, k): + setattr(params, k, defaults_dict[k]) - return parameters + return params def launch(args, defaults, description): @@ -175,57 +177,59 @@ def launch(args, defaults, description): """ logging.basicConfig(level=logging.INFO) - parameters = process_args(args, defaults, description) + params = process_args(args, defaults, description) - if parameters.rom.endswith('.bin'): - rom = parameters.rom + if params.rom.endswith('.bin'): + rom = params.rom else: - rom = "%s.bin" % parameters.rom + rom = "%s.bin" % params.rom full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom) - if parameters.deterministic: - parameters.rng = np.random.RandomState(123456) + if params.deterministic: + params.rng = np.random.RandomState(123456) else: - parameters.rng = np.random.RandomState() + params.rng = np.random.RandomState() - if parameters.cudnn_deterministic: + if params.cudnn_deterministic: theano.config.dnn.conv.algo_bwd = 'deterministic' ale = ale_python_interface.ALEInterface() - ale.setInt('random_seed', parameters.rng.randint(1000)) + ale.setInt('random_seed', params.rng.randint(1000)) - if parameters.display_screen: + if params.display_screen: import sys if sys.platform == 'darwin': import pygame pygame.init() ale.setBool('sound', False) # Sound doesn't work on OSX - ale.setBool('display_screen', parameters.display_screen) + ale.setBool('display_screen', params.display_screen) ale.setFloat('repeat_action_probability', - parameters.repeat_action_probability) + params.repeat_action_probability) ale.loadROM(full_rom_path) - if parameters.agent_type is None: + if params.agent_type is None: raise Exception("The agent type has not been specified") - agent = parameters.agent_type(parameters) - experiment = ale_experiment.ALEExperiment(ale, agent, - parameters.resized_width, - parameters.resized_height, - parameters.resize_method, - parameters.epochs, - parameters.steps_per_epoch, - parameters.steps_per_test, - parameters.frame_skip, - parameters.death_ends_episode, - parameters.max_start_nullops, - parameters.rng) + agent = params.agent_type(params) + + experiment = ale_experiment.ALEExperiment( + ale=ale, + agent=agent, + resized_width=params.resized_width, + resized_height=params.resized_height, + resize_method=params.resize_method, + num_epochs=params.epochs, + epoch_length=params.steps_per_epoch, + test_length=params.steps_per_test, + frame_skip=params.frame_skip, + death_ends_episode=params.death_ends_episode, + max_start_nullops=params.max_start_nullops, + rng=params.rng) experiment.run() - if __name__ == '__main__': pass diff --git a/deep_q_rl/plot_filters.py b/deep_q_rl/plot_filters.py index e79e199..b8b963c 100755 --- a/deep_q_rl/plot_filters.py +++ b/deep_q_rl/plot_filters.py @@ -20,8 +20,8 @@ q_layers = lasagne.layers.get_all_layers(network.l_out) w = q_layers[1].W.get_value() count = 1 -for f in range(w.shape[0]): # filters - for c in 
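The defaults handling just above this plot_filters hunk back-fills any attribute from the defaults class that the argument parser did not define. A self-contained Python 3 sketch of that merge (the patch itself targets Python 2, hence iteritems; the Defaults class below is a throwaway stand-in, not the project's):

    import argparse
    from inspect import ismethod

    class Defaults(object):
        LEARNING_RATE = 0.00025   # made-up values for illustration
        DISCOUNT = 0.99

    parser = argparse.ArgumentParser()
    parser.add_argument('--learning-rate', dest='learning_rate',
                        type=float, default=Defaults.LEARNING_RATE)
    params = parser.parse_args([])

    defaults_dict = {k.lower(): v for k, v in Defaults.__dict__.items()
                     if not ismethod(v) and not k.startswith('__')}
    for k, v in defaults_dict.items():
        if not hasattr(params, k):          # only fill in what argparse did not set
            setattr(params, k, v)

    print(params.learning_rate, params.discount)   # 0.00025 0.99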
range(w.shape[1]): # channels/time-steps +for f in range(w.shape[0]): # filters + for c in range(w.shape[1]): # channels/time-steps plt.subplot(w.shape[0], w.shape[1], count) img = w[f, c, :, :] plt.imshow(img, vmin=img.min(), vmax=img.max(), diff --git a/deep_q_rl/q_learner.py b/deep_q_rl/q_learner.py index 3654f4f..7d404d8 100644 --- a/deep_q_rl/q_learner.py +++ b/deep_q_rl/q_learner.py @@ -7,7 +7,7 @@ class QLearner: def __init__(self, num_actions, input_width, input_height, num_frames, - parameters): + params): pass @abstractmethod diff --git a/deep_q_rl/q_network.py b/deep_q_rl/q_network.py index 39b6b25..e4c6fad 100644 --- a/deep_q_rl/q_network.py +++ b/deep_q_rl/q_network.py @@ -31,33 +31,36 @@ class DeepQLearner(QLearner): def __init__(self, num_actions, input_width, input_height, num_frames, - parameters): + params): super(DeepQLearner, self).__init__(num_actions, - input_width, input_height, num_frames, - parameters) + input_width, + input_height, + num_frames, + params) self.num_actions = num_actions - self.parameters = parameters + self.params = params + self.input_width = input_width self.input_height = input_height self.num_frames = num_frames - self.discount = self.parameters.discount - self.rho = self.parameters.rms_decay - self.lr = self.parameters.learning_rate - self.rms_epsilon = self.parameters.rms_epsilon - self.momentum = self.parameters.momentum - self.clip_delta = self.parameters.clip_delta - self.freeze_interval = self.parameters.freeze_interval - self.batch_size = self.parameters.batch_size - self.update_rule = self.parameters.update_rule - self.batch_accumulator = self.parameters.batch_accumulator - - self.rng = parameters.rng + self.discount = self.params.discount + self.rho = self.params.rms_decay + self.lr = self.params.learning_rate + self.rms_epsilon = self.params.rms_epsilon + self.momentum = self.params.momentum + self.clip_delta = self.params.clip_delta + self.freeze_interval = self.params.freeze_interval + self.batch_size = self.params.batch_size + self.update_rule = self.params.update_rule + self.batch_accumulator = self.params.batch_accumulator + + self.rng = params.rng lasagne.random.set_rng(self.rng) - self.network_type = self.parameters.network_type + self.network_type = self.params.network_type self.update_counter = 0 @@ -92,19 +95,26 @@ def __init__(self, np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) - q_vals = lasagne.layers.get_output(self.l_out, states / self.parameters.input_scale) + q_vals = lasagne.layers.get_output( + self.l_out, + states / self.params.input_scale) if self.freeze_interval > 0: - next_q_vals = lasagne.layers.get_output(self.next_l_out, - next_states / self.parameters.input_scale) + next_q_vals = lasagne.layers.get_output( + self.next_l_out, + next_states / self.params.input_scale) + else: - next_q_vals = lasagne.layers.get_output(self.l_out, - next_states / self.parameters.input_scale) + next_q_vals = lasagne.layers.get_output( + self.l_out, + next_states / self.params.input_scale) + next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) + diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) @@ -116,7 +126,8 @@ def __init__(self, elif self.batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: - raise ValueError("Bad accumulator: {}".format(self.batch_accumulator)) + raise ValueError("Bad accumulator: {}" + 
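The symbolic expressions in this __init__ hunk implement the standard one-step Q-learning target and a squared TD error that is either summed or averaged over the batch. A NumPy sketch of the same arithmetic on a toy batch of two transitions (no Theano/Lasagne; every number below is made up):

    import numpy as np

    discount = 0.99
    q_vals      = np.array([[1.0, 2.0], [0.5, 0.0]])   # Q(s, a) from the online network
    next_q_vals = np.array([[1.5, 0.5], [0.2, 0.3]])   # Q(s', a) from the frozen copy
    actions   = np.array([1, 0])                       # action taken in each transition
    rewards   = np.array([[0.0], [1.0]])
    terminals = np.array([[0], [1]])                   # 1 marks a terminal next state

    target = rewards + (1 - terminals) * discount * next_q_vals.max(axis=1, keepdims=True)
    diff = target - q_vals[np.arange(len(actions)), actions].reshape(-1, 1)

    loss_sum  = np.sum(diff ** 2)    # batch_accumulator == 'sum'
    loss_mean = np.mean(diff ** 2)   # batch_accumulator == 'mean'
    print(loss_sum, loss_mean)

The (1 - terminals) factor mirrors T.ones_like(terminals) - terminals in the hunk: bootstrapping is switched off on terminal transitions.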
.format(self.batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { @@ -149,22 +160,42 @@ def __init__(self, def _build_network(self): if self.network_type == "nature_cuda": - return self._build_nature_network(self.input_width, self.input_height, - self.num_actions, self.num_frames, self.batch_size) + return self._build_nature_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + if self.network_type == "nature_dnn": - return self._build_nature_network_dnn(self.input_width, self.input_height, - self.num_actions, self.num_frames, self.batch_size) + return self._build_nature_network_dnn(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + elif self.network_type == "nips_cuda": - return self._build_nips_network(self.input_width, self.input_height, - self.num_actions, self.num_frames, self.batch_size) + return self._build_nips_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + elif self.network_type == "nips_dnn": - return self._build_nips_network_dnn(self.input_width, self.input_height, - self.num_actions, self.num_frames, self.batch_size) + return self._build_nips_network_dnn(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + elif self.network_type == "linear": - return self._build_linear_network(self.input_width, self.input_height, - self.num_actions, self.num_frames, self.batch_size) + return self._build_linear_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) else: - raise ValueError("Unrecognized network: {}".format(self.network_type)) + raise ValueError("Unrecognized network: {}" + .format(self.network_type)) def train(self, states, actions, rewards, next_states, terminals): """ @@ -188,7 +219,7 @@ def train(self, states, actions, rewards, next_states, terminals): self.rewards_shared.set_value(rewards) self.terminals_shared.set_value(terminals) if (self.freeze_interval > 0 and - self.update_counter % self.freeze_interval == 0): + self.update_counter % self.freeze_interval == 0): self._reset_q_hat() loss, _ = self._train() self.update_counter += 1 @@ -212,7 +243,7 @@ def _reset_q_hat(self): lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) def _build_nature_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. """ @@ -228,7 +259,7 @@ def _build_nature_network(self, input_width, input_height, output_dim, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), # Defaults to Glorot + W=lasagne.init.HeUniform(), # Defaults to Glorot b=lasagne.init.Constant(.1), dimshuffle=True ) @@ -273,9 +304,8 @@ def _build_nature_network(self, input_width, input_height, output_dim, return l_out - def _build_nature_network_dnn(self, input_width, input_height, output_dim, - num_frames, batch_size): + num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. """ @@ -333,10 +363,8 @@ def _build_nature_network_dnn(self, input_width, input_height, output_dim, return l_out - - def _build_nips_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. 
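train() in this hunk refreshes the frozen network every freeze_interval updates by calling _reset_q_hat, which copies all parameter values from the online network to next_l_out. A schematic sketch of that bookkeeping, with a toy class standing in for the Lasagne networks (nothing here touches the real learner):

    class TinyNet(object):               # stand-in for a Lasagne network
        def __init__(self):
            self.values = [0.0]          # one fake parameter

    online, frozen = TinyNet(), TinyNet()
    freeze_interval, update_counter = 3, 0

    for _ in range(7):                   # seven fake SGD updates
        if freeze_interval > 0 and update_counter % freeze_interval == 0:
            frozen.values = list(online.values)   # _reset_q_hat: copy, do not alias
        online.values[0] += 0.1                   # pretend gradient step
        update_counter += 1

    print(online.values, frozen.values)  # the frozen net lags the online net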
""" @@ -351,7 +379,7 @@ def _build_nips_network(self, input_width, input_height, output_dim, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(c01b=True), + # W=lasagne.init.HeUniform(c01b=True), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1), dimshuffle=True @@ -363,7 +391,7 @@ def _build_nips_network(self, input_width, input_height, output_dim, filter_size=(4, 4), stride=(2, 2), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(c01b=True), + # W=lasagne.init.HeUniform(c01b=True), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1), dimshuffle=True @@ -373,7 +401,7 @@ def _build_nips_network(self, input_width, input_height, output_dim, l_conv2, num_units=256, nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -382,16 +410,15 @@ def _build_nips_network(self, input_width, input_height, output_dim, l_hidden1, num_units=output_dim, nonlinearity=None, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) return l_out - def _build_nips_network_dnn(self, input_width, input_height, output_dim, - num_frames, batch_size): + num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. """ @@ -402,14 +429,13 @@ def _build_nips_network_dnn(self, input_width, input_height, output_dim, shape=(batch_size, num_frames, input_width, input_height) ) - l_conv1 = dnn.Conv2DDNNLayer( l_in, num_filters=16, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -420,7 +446,7 @@ def _build_nips_network_dnn(self, input_width, input_height, output_dim, filter_size=(4, 4), stride=(2, 2), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -429,7 +455,7 @@ def _build_nips_network_dnn(self, input_width, input_height, output_dim, l_conv2, num_units=256, nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -438,16 +464,15 @@ def _build_nips_network_dnn(self, input_width, input_height, output_dim, l_hidden1, num_units=output_dim, nonlinearity=None, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) return l_out - def _build_linear_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + num_frames, batch_size): """ Build a simple linear learner. Useful for creating tests that sanity-check the weight update code. 
diff --git a/deep_q_rl/run_nature.py b/deep_q_rl/run_nature.py index bceeefe..924bdeb 100755 --- a/deep_q_rl/run_nature.py +++ b/deep_q_rl/run_nature.py @@ -14,6 +14,7 @@ from q_network import DeepQLearner import launcher + class Parameters(ParametersDefault): # ---------------------- # Experiment Parameters diff --git a/deep_q_rl/run_nips.py b/deep_q_rl/run_nips.py index edc62ca..7b3c45e 100755 --- a/deep_q_rl/run_nips.py +++ b/deep_q_rl/run_nips.py @@ -14,6 +14,7 @@ import launcher import sys + class Parameters(ParametersDefault): # ---------------------- # Experiment Parameters diff --git a/deep_q_rl/test/test_q_network.py b/deep_q_rl/test/test_q_network.py index 8e45b6b..78a6bc8 100644 --- a/deep_q_rl/test/test_q_network.py +++ b/deep_q_rl/test/test_q_network.py @@ -87,19 +87,19 @@ def setUp(self): # Divide the desired learning rate by two, because loss is # defined as L^2, not 1/2 L^2. - self.parameters = ParametersDefault() - self.parameters.discount = .5 - self.parameters.learning_rate = .1 / 2.0 - self.parameters.rms_decay = 0 - self.parameters.rms_epsilon = 0 - self.parameters.momentum = 0 - self.parameters.clip_delta = 0 - self.parameters.freeze_interval = 0 - self.parameters.batch_size = 1 - self.parameters.network_type = 'linear' - self.parameters.update_rule = 'sgd' - self.parameters.batch_accumulator = 'sum' - self.parameters.input_scale = 1.0 + self.params = ParametersDefault() + self.params.discount = .5 + self.params.learning_rate = .1 / 2.0 + self.params.rms_decay = 0 + self.params.rms_epsilon = 0 + self.params.momentum = 0 + self.params.clip_delta = 0 + self.params.freeze_interval = 0 + self.params.batch_size = 1 + self.params.network_type = 'linear' + self.params.update_rule = 'sgd' + self.params.batch_accumulator = 'sum' + self.params.input_scale = 1.0 self.mdp = ChainMDP() @@ -122,11 +122,13 @@ def train(self, net, steps): terminal) def test_updates_sgd_no_freeze(self): - self.parameters.freeze_interval = -1 + self.params.freeze_interval = -1 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) mdp = self.mdp @@ -162,10 +164,12 @@ def test_updates_sgd_no_freeze(self): [0, 0]]) def test_convergence_sgd_no_freeze(self): - self.parameters.freeze_interval = -1 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) + self.params.freeze_interval = -1 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -178,10 +182,12 @@ def test_convergence_random_initialization(self): correctly. Otherwise the random initialization of the value of the terminal state will propagate back. 
""" - self.parameters.freeze_interval = -1 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) + self.params.freeze_interval = -1 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) # Randomize initial q-values: params = lasagne.layers.helper.get_all_param_values(net.l_out) @@ -197,10 +203,12 @@ def test_convergence_random_initialization(self): [.25, 1.0]], 3) def test_convergence_sgd_permanent_freeze(self): - self.parameters.freeze_interval = 1000000 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) + self.params.freeze_interval = 1000000 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -209,10 +217,12 @@ def test_convergence_sgd_permanent_freeze(self): [0, 1.0], [0., 0.]], 3) def test_convergence_sgd_frequent_freeze(self): - self.parameters.freeze_interval = 2 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) + self.params.freeze_interval = 2 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -221,12 +231,14 @@ def test_convergence_sgd_frequent_freeze(self): [.25, 1.0], [0., 0.]], 3) def test_convergence_sgd_one_freeze(self): - self.parameters.freeze_interval = 500 - net = q_network.DeepQLearner(input_width=self.mdp.num_states, input_height=1, - num_actions=self.mdp.num_actions, num_frames=1, - parameters=self.parameters) - - self.train(net, self.parameters.freeze_interval * 2) + self.params.freeze_interval = 500 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) + + self.train(net, self.params.freeze_interval * 2) numpy.testing.assert_almost_equal(self.all_q_vals(net), [[.7, 0], [.35, .5],