diff --git a/src/ggplearn/distributed/server.py b/src/ggplearn/distributed/server.py
index 7accac6..e857d95 100644
--- a/src/ggplearn/distributed/server.py
+++ b/src/ggplearn/distributed/server.py
@@ -30,39 +30,36 @@ def default_conf():
     conf.game = "breakthrough"
     conf.current_step = 1
 
-    conf.score_network_size = "smaller"
-    conf.policy_network_size = "smaller"
-
-    conf.generation_prefix = "v2_"
-    conf.store_path = os.path.join(os.environ["GGPLEARN_PATH"], "data", "breakthrough", "v2")
-
-    conf.score_player_conf = msgdefs.PUCTPlayerConf(name="score_puct",
-                                                    verbose=False,
-                                                    num_of_playouts_per_iteration=32,
-                                                    num_of_playouts_per_iteration_noop=0,
-                                                    expand_root=5,
-                                                    dirichlet_noise_alpha=0.1,
-                                                    cpuct_constant_first_4=0.75,
-                                                    cpuct_constant_after_4=0.75,
-                                                    choose="choose_converge")
-
-    conf.policy_player_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
+    conf.score_network_size = "small"
+    conf.policy_network_size = "small"
+
+    conf.generation_prefix = "v3_"
+    conf.store_path = os.path.join(os.environ["GGPLEARN_PATH"], "data", "breakthrough", "v3")
+
+    # generation set on server
+    conf.player_select_conf = msgdefs.PolicyPlayerConf(verbose=False,
+                                                       choose_exponential_scale=0.3)
+
+    conf.player_policy_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
                                                      verbose=False,
-                                                     num_of_playouts_per_iteration=200,
-                                                     # num_of_playouts_per_iteration=800,
-                                                     num_of_playouts_per_iteration_noop=0,
-                                                     expand_root=5,
+                                                     playouts_per_iteration=800,
+                                                     playouts_per_iteration_noop=0,
+                                                     expand_root=100,
                                                      dirichlet_noise_alpha=-1,
                                                      cpuct_constant_first_4=3.0,
                                                      cpuct_constant_after_4=0.75,
                                                      choose="choose_converge")
+
+    conf.player_score_conf = msgdefs.PolicyPlayerConf(verbose=False,
+                                                      choose_exponential_scale=-1)
 
     conf.generation_size = 32
    conf.max_growth_while_training = 0.2
 
-    conf.validation_split = 0.9
+    conf.validation_split = 0.8
     conf.batch_size = 32
     conf.epochs = 4
     conf.max_sample_count = 100000
+    conf.run_post_training_cmds = []
 
     return conf
@@ -139,13 +136,10 @@ def __init__(self, conf_filename):
 
     def check_nn_generations_exist(self):
         score_gen = self.get_score_generation(self.conf.current_step)
         policy_gen = self.get_policy_generation(self.conf.current_step)
+        log.debug("current policy gen %s" % score_gen)
         log.debug("current score gen %s" % policy_gen)
 
-        # create them (will overwrite even exist)
-        policy_gen = self.get_policy_generation(self.conf.current_step)
-        score_gen = self.get_score_generation(self.conf.current_step)
-
         for g in (policy_gen, score_gen):
             net = network.create(g, self.game_info, load=False)
             if not net.can_load():
@@ -156,10 +150,11 @@ def check_nn_generations_exist(self):
                 critical_error("Did not find network %s. exiting." % g)
 
     def save_our_config(self, rolled=False):
-        if rolled:
-            shutil.copy(self.conf_filename, self.conf_filename + "-%00d" % (self.conf.current_step - 1))
-        else:
-            shutil.copy(self.conf_filename, self.conf_filename + "-bak")
+        if os.path.exists(self.conf_filename):
+            if rolled:
+                shutil.copy(self.conf_filename, self.conf_filename + "-%00d" % (self.conf.current_step - 1))
+            else:
+                shutil.copy(self.conf_filename, self.conf_filename + "-bak")
 
         with open(self.conf_filename, 'w') as open_file:
             open_file.write(attrutil.attr_to_json(self.conf, indent=4))
@@ -288,7 +283,8 @@ def new_generation(self):
         json.encoder.FLOAT_REPR = lambda f: ("%.5f" % f)
 
         log.info("writing json")
-        filename = os.path.join(self.conf.store_path, "gendata_%s.json" % self.conf.current_step)
+        filename = os.path.join(self.conf.store_path, "gendata_%s_%s.json" % (self.conf.game,
+                                                                              self.conf.current_step))
         with open(filename, 'w') as open_file:
             open_file.write(attrutil.attr_to_json(gen, indent=4))
@@ -302,7 +298,7 @@ def new_generation(self):
         m.generation_prefix = self.conf.generation_prefix
         m.store_path = self.conf.store_path
-        m.use_previous = True  # until we are big enough, what is the point?
+        m.use_previous = False  # until we are big enough, what is the point?
         m.next_step = self.conf.current_step + 1
         m.validation_split = self.conf.validation_split
@@ -348,15 +344,22 @@ def roll_generation(self):
         self.generation = None
         log.warning("roll_generation() complete. We have %s samples leftover" % len(self.accumulated_samples))
+        self.schedule_players()
 
     def create_approx_config(self):
-        conf = msgdefs.ConfigureApproxTrainer()
-        conf.game = self.conf.game
-        conf.policy_generation = self.get_policy_generation(self.conf.current_step)
-        conf.score_generation = self.get_score_generation(self.conf.current_step)
-        conf.temperature = 1.0
-        conf.score_puct_player_conf = self.conf.score_player_conf
-        conf.policy_puct_player_conf = self.conf.policy_player_conf
+        # we use score_gen for select also XXX we should probably just go to one
+        policy_generation = self.get_policy_generation(self.conf.current_step)
+        score_generation = self.get_score_generation(self.conf.current_step)
+
+        self.conf.player_select_conf.generation = score_generation
+        self.conf.player_policy_conf.generation = policy_generation
+        self.conf.player_score_conf.generation = score_generation
+
+        conf = msgdefs.ConfigureApproxTrainer(game=self.conf.game)
+        conf.player_select_conf = self.conf.player_select_conf
+        conf.player_policy_conf = self.conf.player_policy_conf
+        conf.player_score_conf = self.conf.player_score_conf
+
         self.approx_play_config = conf
 
     def schedule_players(self):
@@ -395,6 +398,7 @@ def start_server_factory():
     setup_once("worker")
 
     ServerBroker(sys.argv[1])
+    reactor.run()
diff --git a/src/ggplearn/msgdefs.py b/src/ggplearn/msgdefs.py
index 121e9ae..704f091 100644
--- a/src/ggplearn/msgdefs.py
+++ b/src/ggplearn/msgdefs.py
@@ -67,8 +67,9 @@ class ServerConfig(object):
     generation_prefix = attr.ib("v2_")
     store_path = attr.ib("somewhere")
 
-    policy_player_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
-    score_player_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
+    player_select_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))
+    player_policy_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
+    player_score_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))
 
     generation_size = attr.ib(1024)
     max_growth_while_training = attr.ib(0.2)
@@ -106,8 +107,6 @@ class Ok(object):
 @attr.s
 class ConfigureApproxTrainer(object):
     game = attr.ib("breakthrough")
-    policy_generation = attr.ib("gen0_small")
-    score_generation = attr.ib("gen0_smaller")
     temperature = attr.ib(1.0)
     player_select_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))
     player_policy_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
diff --git a/src/ggplearn/player/policyplayer.py b/src/ggplearn/player/policyplayer.py
index d68e994..2f39f14 100644
--- a/src/ggplearn/player/policyplayer.py
+++ b/src/ggplearn/player/policyplayer.py
@@ -23,7 +23,8 @@ def __init__(self, conf):
         self.bases_config = None
 
     def on_meta_gaming(self, finish_time):
-        log.info("%s, match id: %s" % (self.get_name(), self.match.match_id))
+        if self.conf.verbose:
+            log.info("%s, match id: %s" % (self.get_name(), self.match.match_id))
 
         game_info = self.match.game_info
diff --git a/src/ggplearn/player/puctplayer.py b/src/ggplearn/player/puctplayer.py
index baaf0e4..2fe7e29 100644
--- a/src/ggplearn/player/puctplayer.py
+++ b/src/ggplearn/player/puctplayer.py
@@ -116,8 +116,10 @@ def __init__(self, conf=None):
         MatchPlayer.__init__(self, identifier)
 
     def on_meta_gaming(self, finish_time):
+        if self.conf.verbose:
+            log.info("PUCTPlayer, match id: %s" % self.match.match_id)
+
         self.root = None
-        log.info("PUCTPlayer, match id: %s" % self.match.match_id)
 
         sm = self.match.sm
         game_info = self.match.game_info
diff --git a/src/ggplearn/training/approximate_play.py b/src/ggplearn/training/approximate_play.py
index fbd241d..c3f2589 100644
--- a/src/ggplearn/training/approximate_play.py
+++ b/src/ggplearn/training/approximate_play.py
@@ -1,3 +1,11 @@
+"""
+back to square 1, more or less
+  * first play is random-ish with policy player - gm_select
+  * second uses puct player, just on selected state - gm_policy
+  * third starting from the same state as policy was trained
+    on (not the resultant state), policy player for score - gm_score
+"""
+
 import time
 
 import numpy as np
@@ -10,7 +18,8 @@
 from ggplearn.util import attrutil
 from ggplearn import msgdefs
 
-from ggplearn.player import mc
+from ggplearn.player.puctplayer import PUCTPlayer
+from ggplearn.player.policyplayer import PolicyPlayer
 
 
 class Runner(object):
@@ -22,23 +31,25 @@ def __init__(self, conf):
         self.conf = conf
 
         # create two game masters, one for the score playout, and one for the policy evaluation
-        self.gm = GameMaster(get_gdl_for_game(self.conf.game))
-        self.gm_policy = GameMaster(get_gdl_for_game(self.conf.game))
-        self.gm_score = GameMaster(get_gdl_for_game(self.conf.game))
+        self.gm_select = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)
+        self.gm_policy = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)
+        self.gm_score = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)
 
         # add players to gamemasteres
-        for role in self.gm_score.sm.get_roles():
-            player = mc.PUCTPlayer(self.conf.score_generation,
-                                   self.conf.score_puct_player_conf)
-            self.gm_score.add_player(player, role)
+        for role in self.gm_select.sm.get_roles():
+            self.gm_select.add_player(PolicyPlayer(self.conf.player_select_conf), role)
 
         for role in self.gm_policy.sm.get_roles():
-            player = mc.PUCTPlayer(self.conf.policy_generation,
-                                   self.conf.policy_puct_player_conf)
-            self.gm_policy.add_player(player, role)
+            self.gm_policy.add_player(PUCTPlayer(self.conf.player_policy_conf), role)
+
+        for role in self.gm_score.sm.get_roles():
+            self.gm_score.add_player(PolicyPlayer(self.conf.player_score_conf), role)
 
         # cache a local statemachine basestate (doesn't matter which gm it comes from)
-        self.basestate = self.gm_policy.sm.new_base_state()
+        self.basestate = self.gm_select.sm.new_base_state()
+
+        # and cache roles
+        self.roles = self.gm_select.sm.get_roles()
 
         # we want unique samples per generation, so store a unique_set here
         self.unique_states = set()
@@ -48,31 +59,32 @@ def reset_debug(self):
         # debug times
         self.acc_time_for_play_one_game = 0
         self.acc_time_for_do_policy = 0
+        self.acc_time_for_do_score = 0
 
     def add_to_unique_states(self, state):
         self.unique_states.add(state)
 
     def get_bases(self):
-        self.gm_score.sm.get_current_state(self.basestate)
+        self.gm_select.sm.get_current_state(self.basestate)
         return tuple(self.basestate.to_list())
 
-    def play_one_game(self):
-        self.gm_score.reset()
+    def play_one_game_for_selection(self):
+        self.gm_select.reset()
 
-        self.gm_score.start(meta_time=20, move_time=10)
+        self.gm_select.start(meta_time=20, move_time=10)
 
         states = [(0, self.get_bases())]
 
         last_move = None
         depth = 1
-        while not self.gm_score.finished():
-            last_move = self.gm_score.play_single_move(last_move=last_move)
+        while not self.gm_select.finished():
+            last_move = self.gm_select.play_single_move(last_move=last_move)
             states.append((depth, self.get_bases()))
             depth += 1
 
         # cleanup
-        self.gm_score.play_to_end(last_move)
-        return states, self.gm_score.scores
+        self.gm_select.play_to_end(last_move)
+        return states
 
     def do_policy(self, state):
         for i, v in enumerate(state):
@@ -96,25 +108,29 @@ def do_policy(self, state):
         dist = [(c.legal, p) for c, p in player.get_probabilities(self.conf.temperature)]
         return dist, lead_role_index
 
+    def do_score(self, state):
+        for i, v in enumerate(state):
+            self.basestate.set(i, v)
+
+        self.gm_score.reset()
+        self.gm_score.start(meta_time=30, move_time=10, initial_basestate=self.basestate)
+        self.gm_score.play_to_end()
+
+        # return a list of scores as we expect them in the neural network
+        return [self.gm_score.get_score(role) / 100.0 for role in self.roles]
+
     def generate_sample(self):
         # debug
-        score_player = self.gm_score.get_player(0)
-        policy_player = self.gm_policy.get_player(0)
-        log.debug("generate_sample() gens score: %s, policy: %s" % (score_player.generation,
-                                                                    policy_player.generation))
-        log.debug("iterations score: %s, policy: %s" % (score_player.conf.num_of_playouts_per_iteration,
-                                                        policy_player.conf.num_of_playouts_per_iteration))
-
+        log.debug("Entering generate_sample()")
         log.debug("unique_states: %s" % len(self.unique_states))
 
         start_time = time.time()
-        states, final_score = self.play_one_game()
-        game_length = len(states)
-
-        log.debug("Done play_one_game(), game_length %d" % game_length)
-
+        states = self.play_one_game_for_selection()
         self.acc_time_for_play_one_game += time.time() - start_time
 
+        game_length = len(states)
+        log.debug("Done play_one_game_for_selection(), game_length %d" % game_length)
+
         shuffle_states = states[:]
 
         # pop the final state, as we don't want terminal states. But keep in states intact
@@ -132,15 +148,18 @@ def generate_sample(self):
             start_time = time.time()
             policy_dist, lead_role_index = self.do_policy(state)
-            log.debug("Done do_policy()")
             self.acc_time_for_do_policy += time.time() - start_time
 
-            prev2 = None  # states[depth - 3] if depth >= 3 else None
-            prev1 = None  # states[depth - 2] if depth >= 2 else None
-            prev0 = None  # states[depth - 1] if depth >= 1 else None
+            # start from state and not from what policy returns (which would add bias)
+            start_time = time.time()
+            final_score = self.do_score(state)
+            log.debug("Done do_score()")
+            self.acc_time_for_do_score += time.time() - start_time
+
-            sample = msgdefs.Sample(prev2, prev1, prev0,
+            prev_state = states[depth - 1] if depth >= 1 else None
+            sample = msgdefs.Sample(prev_state,
                                     state, policy_dist, final_score,
                                     depth, game_length, lead_role_index)
diff --git a/src/test/player/test_performance.py b/src/test/player/test_performance.py
index c9f7f80..df9bb0f 100644
--- a/src/test/player/test_performance.py
+++ b/src/test/player/test_performance.py
@@ -8,6 +8,7 @@
 from ggplearn.player.puctplayer import PUCTPlayer
 from ggplearn.player.policyplayer import PolicyPlayer
 
+current_gen = "testgen_normal_1"
 
 def setup():
     from ggplib.util.init import setup_once
@@ -22,7 +23,7 @@ def test_speed_of_one_shot():
     gm = GameMaster(get_gdl_for_game("breakthrough"))
 
-    conf = msgdefs.PolicyPlayerConf(generation="testgen_normal_1", verbose=False)
+    conf = msgdefs.PolicyPlayerConf(generation=current_gen, verbose=False)
     white = PolicyPlayer(conf)
     black = PolicyPlayer(conf)
@@ -52,12 +53,12 @@ def test_speed_of_one_simulation():
     gm = GameMaster(get_gdl_for_game("breakthrough"))
 
     conf_puct = msgdefs.PUCTPlayerConf(verbose=False,
-                                       generation="testgen_normal_1",
+                                       generation=current_gen,
                                        playouts_per_iteration=800,
                                        playouts_per_iteration_noop=0,
                                        dirichlet_noise_alpha=-1,
                                        expand_root=-1)
-    conf_policy = msgdefs.PolicyPlayerConf(generation="testgen_normal_1", verbose=False)
+    conf_policy = msgdefs.PolicyPlayerConf(generation=current_gen, verbose=False)
 
     # add two players
     white = PUCTPlayer(conf=conf_puct)
diff --git a/src/test/test_approx_play.py b/src/test/test_approx_play.py
index 56e090a..67968d2 100644
--- a/src/test/test_approx_play.py
+++ b/src/test/test_approx_play.py
@@ -20,33 +20,32 @@
 from ggplearn import msgdefs
 
 
+current_gen = "testgen_normal_1"
+
+
 def go_test():
     conf = msgdefs.ConfigureApproxTrainer()
-    conf.policy_generation = "gen9_small"
-    conf.score_generation = "gen9_smaller"
-
-    conf.score_puct_player_conf = msgdefs.PUCTPlayerConf(name="score_puct",
-                                                         verbose=False,
-                                                         num_of_playouts_per_iteration=32,
-                                                         num_of_playouts_per_iteration_noop=0,
-                                                         expand_root=5,
-                                                         dirichlet_noise_alpha=0.1,
-                                                         cpuct_constant_first_4=0.75,
-                                                         cpuct_constant_after_4=0.75,
-                                                         choose="choose_converge")
-
-    conf.policy_puct_player_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
-                                                          verbose=False,
-                                                          num_of_playouts_per_iteration=800,
-                                                          num_of_playouts_per_iteration_noop=0,
-                                                          expand_root=5,
-                                                          dirichlet_noise_alpha=-1,
-                                                          cpuct_constant_first_4=3.0,
-                                                          cpuct_constant_after_4=0.75,
-                                                          choose="choose_converge")
+    conf.player_select_conf = msgdefs.PolicyPlayerConf(verbose=False,
+                                                       generation=current_gen,
+                                                       choose_exponential_scale=0.3)
+
+    conf.player_policy_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
+                                                     verbose=False,
+                                                     generation=current_gen,
+                                                     playouts_per_iteration=800,
+                                                     playouts_per_iteration_noop=0,
+                                                     expand_root=100,
+                                                     dirichlet_noise_alpha=-1,
+                                                     cpuct_constant_first_4=3.0,
+                                                     cpuct_constant_after_4=0.75,
+                                                     choose="choose_converge")
+
+    conf.player_score_conf = msgdefs.PolicyPlayerConf(verbose=False,
+                                                      generation=current_gen,
+                                                      choose_exponential_scale=-1)
 
     runner = Runner(conf)
-    number_of_samples = 5
+    number_of_samples = 10
 
     # slow first run
     runner.generate_sample()
@@ -54,11 +53,18 @@ def go_test():
 
     total_time = 0
-    for _ in range(5):
+    for _ in range(number_of_samples):
         start = time.time()
         print runner.generate_sample()
         total_time += (time.time() - start)
 
-    print "average time to generate sample: %.2f" % (total_time / number_of_samples)
+    print "av time for play_one_game: %.2f" % (runner.acc_time_for_play_one_game / number_of_samples)
     print "av time for do_policy: %.2f" % (runner.acc_time_for_do_policy / number_of_samples)
-    print "av time for play one game: %.2f" % (runner.acc_time_for_play_one_game / number_of_samples)
+    print "av time for do_score: %.2f" % (runner.acc_time_for_do_score / number_of_samples)
+
+    print "average time to generate sample: %.2f" % (total_time / number_of_samples)
+
+
+if __name__ == "__main__":
+    from ggplearn.util.main import main_wrap
+    main_wrap(go_test)
diff --git a/src/test/test_nn_train.py b/src/test/test_nn_train.py
index 57cbac9..535f682 100644
--- a/src/test/test_nn_train.py
+++ b/src/test/test_nn_train.py
@@ -174,13 +174,13 @@ def nn_train_random_generated():
     ' not a unit test - like can take over a few hours ! '
     CREATE_FILE = False
     ACTUALLY_TRAIN = True
-    SAMPLE_COUNT = 10000
+    SAMPLE_COUNT = 50000
 
     train_conf = msgdefs.TrainNNRequest()
     train_conf.game = "breakthrough"
-    train_conf.network_size = "smaller"
-    train_conf.generation_prefix = "test2"
+    train_conf.network_size = "small"
+    train_conf.generation_prefix = "v3"
     train_conf.store_path = os.getcwd()
 
     # uses previous network