This repository has been archived by the owner on Jul 29, 2024. It is now read-only.

Commit

get everything working again
richemslie committed Dec 20, 2017
1 parent 42fec27 commit 3ff08fa
Showing 8 changed files with 146 additions and 114 deletions.
82 changes: 43 additions & 39 deletions src/ggplearn/distributed/server.py
@@ -30,39 +30,36 @@ def default_conf():
conf.game = "breakthrough"
conf.current_step = 1

conf.score_network_size = "smaller"
conf.policy_network_size = "smaller"

conf.generation_prefix = "v2_"
conf.store_path = os.path.join(os.environ["GGPLEARN_PATH"], "data", "breakthrough", "v2")

conf.score_player_conf = msgdefs.PUCTPlayerConf(name="score_puct",
verbose=False,
num_of_playouts_per_iteration=32,
num_of_playouts_per_iteration_noop=0,
expand_root=5,
dirichlet_noise_alpha=0.1,
cpuct_constant_first_4=0.75,
cpuct_constant_after_4=0.75,
choose="choose_converge")

conf.policy_player_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
conf.score_network_size = "small"
conf.policy_network_size = "small"

conf.generation_prefix = "v3_"
conf.store_path = os.path.join(os.environ["GGPLEARN_PATH"], "data", "breakthrough", "v3")

# generation set on server
conf.player_select_conf = msgdefs.PolicyPlayerConf(verbose=False,
choose_exponential_scale=0.3)

conf.player_policy_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
verbose=False,
num_of_playouts_per_iteration=200,
# num_of_playouts_per_iteration=800,
num_of_playouts_per_iteration_noop=0,
expand_root=5,
playouts_per_iteration=800,
playouts_per_iteration_noop=0,
expand_root=100,
dirichlet_noise_alpha=-1,
cpuct_constant_first_4=3.0,
cpuct_constant_after_4=0.75,
choose="choose_converge")

conf.player_score_conf = msgdefs.PolicyPlayerConf(verbose=False,
choose_exponential_scale=-1)
conf.generation_size = 32
conf.max_growth_while_training = 0.2

conf.validation_split = 0.9
conf.validation_split = 0.8
conf.batch_size = 32
conf.epochs = 4
conf.max_sample_count = 100000
conf.run_post_training_cmds = []

return conf

@@ -139,13 +136,10 @@ def __init__(self, conf_filename):
def check_nn_generations_exist(self):
score_gen = self.get_score_generation(self.conf.current_step)
policy_gen = self.get_policy_generation(self.conf.current_step)

log.debug("current policy gen %s" % score_gen)
log.debug("current score gen %s" % policy_gen)

# create them (will overwrite even if they exist)
policy_gen = self.get_policy_generation(self.conf.current_step)
score_gen = self.get_score_generation(self.conf.current_step)

for g in (policy_gen, score_gen):
net = network.create(g, self.game_info, load=False)
if not net.can_load():
@@ -156,10 +150,11 @@ def check_nn_generations_exist(self):
critical_error("Did not find network %s. exiting." % g)

def save_our_config(self, rolled=False):
if rolled:
shutil.copy(self.conf_filename, self.conf_filename + "-%00d" % (self.conf.current_step - 1))
else:
shutil.copy(self.conf_filename, self.conf_filename + "-bak")
if os.path.exists(self.conf_filename):
if rolled:
shutil.copy(self.conf_filename, self.conf_filename + "-%00d" % (self.conf.current_step - 1))
else:
shutil.copy(self.conf_filename, self.conf_filename + "-bak")

with open(self.conf_filename, 'w') as open_file:
open_file.write(attrutil.attr_to_json(self.conf, indent=4))
@@ -288,7 +283,8 @@ def new_generation(self):
json.encoder.FLOAT_REPR = lambda f: ("%.5f" % f)

log.info("writing json")
filename = os.path.join(self.conf.store_path, "gendata_%s.json" % self.conf.current_step)
filename = os.path.join(self.conf.store_path, "gendata_%s_%s.json" % (self.conf.game,
self.conf.current_step))
with open(filename, 'w') as open_file:
open_file.write(attrutil.attr_to_json(gen, indent=4))

@@ -302,7 +298,7 @@ def new_generation(self):
m.generation_prefix = self.conf.generation_prefix
m.store_path = self.conf.store_path

m.use_previous = True # until we are big enough, what is the point?
m.use_previous = False # until we are big enough, what is the point?

m.next_step = self.conf.current_step + 1
m.validation_split = self.conf.validation_split
@@ -348,15 +344,22 @@ def roll_generation(self):

self.generation = None
log.warning("roll_generation() complete. We have %s samples leftover" % len(self.accumulated_samples))
self.schedule_players()

def create_approx_config(self):
conf = msgdefs.ConfigureApproxTrainer()
conf.game = self.conf.game
conf.policy_generation = self.get_policy_generation(self.conf.current_step)
conf.score_generation = self.get_score_generation(self.conf.current_step)
conf.temperature = 1.0
conf.score_puct_player_conf = self.conf.score_player_conf
conf.policy_puct_player_conf = self.conf.policy_player_conf
# we use score_gen for select also XXX we should probably just go to one
policy_generation = self.get_policy_generation(self.conf.current_step)
score_generation = self.get_score_generation(self.conf.current_step)

self.conf.player_select_conf.generation = score_generation
self.conf.player_policy_conf.generation = policy_generation
self.conf.player_score_conf.generation = score_generation

conf = msgdefs.ConfigureApproxTrainer(game=self.conf.game)
conf.player_select_conf = self.conf.player_select_conf
conf.player_policy_conf = self.conf.player_policy_conf
conf.player_score_conf = self.conf.player_score_conf

self.approx_play_config = conf

def schedule_players(self):
@@ -395,6 +398,7 @@ def start_server_factory():
setup_once("worker")

ServerBroker(sys.argv[1])

reactor.run()


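For orientation (not part of the commit): a minimal sketch of how the reworked default_conf() above might be exercised, assuming default_conf() builds a msgdefs.ServerConfig whose fields all have usable defaults, and using only the msgdefs/attrutil calls that appear in this diff. Serialisation mirrors what save_our_config() does when it writes the config to disk.

    import os
    from ggplearn import msgdefs
    from ggplearn.util import attrutil

    # build a config along the lines of the new default_conf()
    conf = msgdefs.ServerConfig()   # assumed constructible with its defaults
    conf.game = "breakthrough"
    conf.current_step = 1
    conf.generation_prefix = "v3_"
    conf.store_path = os.path.join(os.environ["GGPLEARN_PATH"],
                                   "data", "breakthrough", "v3")

    # the three player configs this commit introduces: a cheap policy player to
    # select states, a PUCT player to produce the training distribution, and
    # another policy player to score the selected state
    conf.player_select_conf = msgdefs.PolicyPlayerConf(verbose=False,
                                                       choose_exponential_scale=0.3)
    conf.player_policy_conf = msgdefs.PUCTPlayerConf(name="policy_puct",
                                                     verbose=False,
                                                     playouts_per_iteration=800,
                                                     playouts_per_iteration_noop=0,
                                                     dirichlet_noise_alpha=-1,
                                                     choose="choose_converge")
    conf.player_score_conf = msgdefs.PolicyPlayerConf(verbose=False,
                                                      choose_exponential_scale=-1)

    # serialise it the same way save_our_config() does
    print(attrutil.attr_to_json(conf, indent=4))
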
7 changes: 3 additions & 4 deletions src/ggplearn/msgdefs.py
@@ -67,8 +67,9 @@ class ServerConfig(object):
generation_prefix = attr.ib("v2_")
store_path = attr.ib("somewhere")

policy_player_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
score_player_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
player_select_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))
player_policy_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
player_score_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))

generation_size = attr.ib(1024)
max_growth_while_training = attr.ib(0.2)
@@ -106,8 +107,6 @@ class Ok(object):
@attr.s
class ConfigureApproxTrainer(object):
game = attr.ib("breakthrough")
policy_generation = attr.ib("gen0_small")
score_generation = attr.ib("gen0_smaller")
temperature = attr.ib(1.0)
player_select_conf = attr.ib(default=attr.Factory(PolicyPlayerConf))
player_policy_conf = attr.ib(default=attr.Factory(PUCTPlayerConf))
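The nested player configs above get attr.Factory defaults, which matters: a factory builds a fresh instance for every ServerConfig, whereas a plain instance default would be one shared mutable object. A tiny self-contained illustration using only the attrs library (class names here are made up):

    import attr

    @attr.s
    class PlayerConf(object):
        verbose = attr.ib(False)
        playouts = attr.ib(800)

    @attr.s
    class Config(object):
        game = attr.ib("breakthrough")
        # attr.Factory builds a new PlayerConf per Config instance,
        # so instances never share one mutable default
        player_conf = attr.ib(default=attr.Factory(PlayerConf))

    a, b = Config(), Config()
    a.player_conf.playouts = 42
    assert b.player_conf.playouts == 800   # b is unaffected
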
3 changes: 2 additions & 1 deletion src/ggplearn/player/policyplayer.py
@@ -23,7 +23,8 @@ def __init__(self, conf):
self.bases_config = None

def on_meta_gaming(self, finish_time):
log.info("%s, match id: %s" % (self.get_name(), self.match.match_id))
if self.conf.verbose:
log.info("%s, match id: %s" % (self.get_name(), self.match.match_id))

game_info = self.match.game_info

4 changes: 3 additions & 1 deletion src/ggplearn/player/puctplayer.py
@@ -116,8 +116,10 @@ def __init__(self, conf=None):
MatchPlayer.__init__(self, identifier)

def on_meta_gaming(self, finish_time):
if self.conf.verbose:
log.info("PUCTPlayer, match id: %s" % self.match.match_id)

self.root = None
log.info("PUCTPlayer, match id: %s" % self.match.match_id)

sm = self.match.sm
game_info = self.match.game_info
93 changes: 56 additions & 37 deletions src/ggplearn/training/approximate_play.py
@@ -1,3 +1,11 @@
"""
back to square 1, more or less
* first play is random-ish with policy player - gm_select
* second uses puct player, just on selected state - gm_policy
* third starting from the same state as policy was trained
on (not the resultant state), policy player for score - gm_score
"""

import time

import numpy as np
@@ -10,7 +18,8 @@
from ggplearn.util import attrutil

from ggplearn import msgdefs
from ggplearn.player import mc
from ggplearn.player.puctplayer import PUCTPlayer
from ggplearn.player.policyplayer import PolicyPlayer


class Runner(object):
@@ -22,23 +31,25 @@ def __init__(self, conf):
self.conf = conf

# create three game masters: one for state selection, one for the policy evaluation, and one for the score playout
self.gm = GameMaster(get_gdl_for_game(self.conf.game))
self.gm_policy = GameMaster(get_gdl_for_game(self.conf.game))
self.gm_score = GameMaster(get_gdl_for_game(self.conf.game))
self.gm_select = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)
self.gm_policy = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)
self.gm_score = GameMaster(get_gdl_for_game(self.conf.game), fast_reset=True)

# add players to the game masters
for role in self.gm_score.sm.get_roles():
player = mc.PUCTPlayer(self.conf.score_generation,
self.conf.score_puct_player_conf)
self.gm_score.add_player(player, role)
for role in self.gm_select.sm.get_roles():
self.gm_select.add_player(PolicyPlayer(self.conf.player_select_conf), role)

for role in self.gm_policy.sm.get_roles():
player = mc.PUCTPlayer(self.conf.policy_generation,
self.conf.policy_puct_player_conf)
self.gm_policy.add_player(player, role)
self.gm_policy.add_player(PUCTPlayer(self.conf.player_policy_conf), role)

for role in self.gm_score.sm.get_roles():
self.gm_score.add_player(PolicyPlayer(self.conf.player_score_conf), role)

# cache a local statemachine basestate (doesn't matter which gm it comes from)
self.basestate = self.gm_policy.sm.new_base_state()
self.basestate = self.gm_select.sm.new_base_state()

# and cache roles
self.roles = self.gm_select.sm.get_roles()

# we want unique samples per generation, so store a unique_set here
self.unique_states = set()
@@ -48,31 +59,32 @@ def reset_debug(self):
# debug times
self.acc_time_for_play_one_game = 0
self.acc_time_for_do_policy = 0
self.acc_time_for_do_score = 0

def add_to_unique_states(self, state):
self.unique_states.add(state)

def get_bases(self):
self.gm_score.sm.get_current_state(self.basestate)
self.gm_select.sm.get_current_state(self.basestate)
return tuple(self.basestate.to_list())

def play_one_game(self):
self.gm_score.reset()
def play_one_game_for_selection(self):
self.gm_select.reset()

self.gm_score.start(meta_time=20, move_time=10)
self.gm_select.start(meta_time=20, move_time=10)

states = [(0, self.get_bases())]

last_move = None
depth = 1
while not self.gm_score.finished():
last_move = self.gm_score.play_single_move(last_move=last_move)
while not self.gm_select.finished():
last_move = self.gm_select.play_single_move(last_move=last_move)
states.append((depth, self.get_bases()))
depth += 1

# cleanup
self.gm_score.play_to_end(last_move)
return states, self.gm_score.scores
self.gm_select.play_to_end(last_move)
return states

def do_policy(self, state):
for i, v in enumerate(state):
@@ -96,25 +108,29 @@ def do_policy(self, state):
dist = [(c.legal, p) for c, p in player.get_probabilities(self.conf.temperature)]
return dist, lead_role_index

def do_score(self, state):
for i, v in enumerate(state):
self.basestate.set(i, v)

self.gm_score.reset()
self.gm_score.start(meta_time=30, move_time=10, initial_basestate=self.basestate)
self.gm_score.play_to_end()

# return a list of scores as we expect them in the neural network
return [self.gm_score.get_score(role) / 100.0 for role in self.roles]

def generate_sample(self):
# debug
score_player = self.gm_score.get_player(0)
policy_player = self.gm_policy.get_player(0)
log.debug("generate_sample() gens score: %s, policy: %s" % (score_player.generation,
policy_player.generation))
log.debug("iterations score: %s, policy: %s" % (score_player.conf.num_of_playouts_per_iteration,
policy_player.conf.num_of_playouts_per_iteration))

log.debug("Entering generate_sample()")
log.debug("unique_states: %s" % len(self.unique_states))

start_time = time.time()
states, final_score = self.play_one_game()
game_length = len(states)

log.debug("Done play_one_game(), game_length %d" % game_length)

states = self.play_one_game_for_selection()
self.acc_time_for_play_one_game += time.time() - start_time

game_length = len(states)
log.debug("Done play_one_game_for_selection(), game_length %d" % game_length)

shuffle_states = states[:]

# pop the final state, as we don't want terminal states. But keep the states list intact
@@ -132,15 +148,18 @@

start_time = time.time()
policy_dist, lead_role_index = self.do_policy(state)

log.debug("Done do_policy()")
self.acc_time_for_do_policy += time.time() - start_time

prev2 = None # states[depth - 3] if depth >= 3 else None
prev1 = None # states[depth - 2] if depth >= 2 else None
prev0 = None # states[depth - 1] if depth >= 1 else None
# start from state and not from what policy returns (which would add bias)
start_time = time.time()
final_score = self.do_score(state)
log.debug("Done do_score()")
self.acc_time_for_do_score += time.time() - start_time


sample = msgdefs.Sample(prev2, prev1, prev0,
prev_state = states[depth - 1] if depth >= 1 else None
sample = msgdefs.Sample(prev_state,
state, policy_dist, final_score,
depth, game_length, lead_role_index)

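The new module docstring above describes a three-phase sampling flow: select a state by playing the (random-ish) policy player, get a PUCT policy distribution for that state, then score the same starting state with a policy player. A rough sketch of that flow, assembled only from the GameMaster/player calls visible in this diff; the choice of state and the single-move call in phase 2 are simplifications, not the actual generate_sample() logic:

    def generate_one_sample(gm_select, gm_policy, gm_score, basestate, roles,
                            temperature=1.0):
        # phase 1 (gm_select): play a quick game with the policy player and
        # record every state along the way
        gm_select.reset()
        gm_select.start(meta_time=20, move_time=10)
        gm_select.sm.get_current_state(basestate)
        states = [tuple(basestate.to_list())]   # include the initial state
        last_move = None
        while not gm_select.finished():
            last_move = gm_select.play_single_move(last_move=last_move)
            gm_select.sm.get_current_state(basestate)
            states.append(tuple(basestate.to_list()))
        gm_select.play_to_end(last_move)

        # pick one non-terminal state (the real code shuffles and skips duplicates)
        state = states[len(states) // 2]
        for i, v in enumerate(state):
            basestate.set(i, v)

        # phase 2 (gm_policy): run the PUCT player from that state to get a
        # distribution over moves (assumed: a single move call is enough)
        gm_policy.reset()
        gm_policy.start(meta_time=30, move_time=10, initial_basestate=basestate)
        gm_policy.play_single_move(last_move=None)
        puct_player = gm_policy.get_player(0)
        dist = [(c.legal, p) for c, p in puct_player.get_probabilities(temperature)]

        # phase 3 (gm_score): play the policy player to the end from the *same*
        # starting state (not the state the PUCT move led to)
        gm_score.reset()
        gm_score.start(meta_time=30, move_time=10, initial_basestate=basestate)
        gm_score.play_to_end()
        final_score = [gm_score.get_score(role) / 100.0 for role in roles]

        return state, dist, final_score
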
7 changes: 4 additions & 3 deletions src/test/player/test_performance.py
@@ -8,6 +8,7 @@
from ggplearn.player.puctplayer import PUCTPlayer
from ggplearn.player.policyplayer import PolicyPlayer

current_gen = "testgen_normal_1"

def setup():
from ggplib.util.init import setup_once
@@ -22,7 +23,7 @@ def test_speed_of_one_shot():

gm = GameMaster(get_gdl_for_game("breakthrough"))

conf = msgdefs.PolicyPlayerConf(generation="testgen_normal_1", verbose=False)
conf = msgdefs.PolicyPlayerConf(generation=current_gen, verbose=False)

white = PolicyPlayer(conf)
black = PolicyPlayer(conf)
@@ -52,12 +53,12 @@ def test_speed_of_one_simulation():
gm = GameMaster(get_gdl_for_game("breakthrough"))

conf_puct = msgdefs.PUCTPlayerConf(verbose=False,
generation="testgen_normal_1",
generation=current_gen,
playouts_per_iteration=800,
playouts_per_iteration_noop=0,
dirichlet_noise_alpha=-1,
expand_root=-1)
conf_policy = msgdefs.PolicyPlayerConf(generation="testgen_normal_1", verbose=False)
conf_policy = msgdefs.PolicyPlayerConf(generation=current_gen, verbose=False)

# add two players
white = PUCTPlayer(conf=conf_puct)
(diffs for the 2 remaining changed files are not shown)
