-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
104 changed files
with
12,809 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
*.zip | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/bin/bash
# Start one detached `screen` session per raw PGN archive; each session runs
# find_top_players.py on that archive and writes its CSV into $output_dir.
# A real shebang is required: the brace expansions below are a bash feature.

lichesss_raw_dir='/data/chess/bz2/standard/'
output_dir='../../data/player_counts'
mkdir -p "$output_dir"

# Drop patterns that match nothing instead of looping over the literal glob.
shopt -s nullglob

# Archives ending in -01 .. -11, plus those ending in -12 whose preceding
# character is a digit from 3 to 8.
for t in "$lichesss_raw_dir"/*-{01..11}.pgn.bz2 "$lichesss_raw_dir"/*{3..8}-12.pgn.bz2; do
    fname="$(basename -- "$t")"
    echo "${t} ${output_dir}/${fname}.csv.bz2"
    # ~/.bashrc is sourced inside the session before python3 is started.
    screen -S "filter-${fname}" -dm bash -c "source ~/.bashrc; python3 find_top_players.py ${t} ${output_dir}/${fname}.csv.bz2"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/bin/bash
# Build the transfer-player dataset: combine the player-count files, split the
# raw archives by the most frequent players, compute per-player game tables,
# pick train/validate/test players and copy their files into place.

lichesss_raw_dir='/data/chess/bz2/standard/'
counts_dir='../../data/player_counts'
counts_file='../../data/player_counts_combined.csv.bz2'
top_list='../../data/player_counts_combined_top_names.csv.bz2'

output_2000_dir='../../data/top_2000_player_games'
output_2000_metadata_dir='../../data/top_2000_player_data'

players_list='../../data/select_transfer_players'

final_data_dir='../../data/transfer_players_data'

num_train=10
num_val=900
num_test=100

python3 combine_player_counts.py "$counts_dir"/* "$counts_file"

# NOTE(review): if the combined file starts with a header row, these 2000
# lines are the header plus 1999 players -- confirm that is what is wanted.
bzcat "$counts_file" | head -n 2000 | bzip2 > "$top_list"

mkdir -p "$output_2000_dir"

python3 split_by_players.py "$top_list" "$lichesss_raw_dir"/*-{01..11}.pgn.bz2 "$lichesss_raw_dir"/*{3..8}-12.pgn.bz2 "$output_2000_dir"

rm -v "$top_list"

mkdir -p "$output_2000_metadata_dir"

python3 player_game_counts.py "$output_2000_dir" "$output_2000_metadata_dir"

# No trailing backslash after the last argument: a dangling continuation
# would swallow whatever line happens to follow.
python3 select_top_players.py "$output_2000_metadata_dir" \
    "${players_list}_train.csv" "$num_train" \
    "${players_list}_validate.csv" "$num_val" \
    "${players_list}_test.csv" "$num_test"

mkdir -p "$final_data_dir"
mkdir -p "$final_data_dir/metadata"
cp -v "${players_list}"*.csv "$final_data_dir/metadata"

for c in "train" "validate" "test"; do
    # -p keeps a re-run from failing on directories that already exist.
    mkdir -p "$final_data_dir/${c}"
    mkdir -p "$final_data_dir/${c}_metadata"
    # Skip the header row and take the first comma-separated field of each line.
    for t in $(tail -n +2 "${players_list}_${c}.csv" | awk -F ',' '{print $1}'); do
        cp -v "${output_2000_dir}/${t}.pgn.bz2" "$final_data_dir/${c}"
        cp "${output_2000_metadata_dir}/${t}.csv.bz2" "$final_data_dir/${c}_metadata"
    done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash
# Select a fixed number of validation players per rating bin and copy each
# selected player's directory into the extended-set directory.
set -e

vals_dat_dir="../../data/transfer_players_data/validate_metadata/"
vals_dir="../../data/transfer_players_validate"
output_dir="../../data/transfer_players_extended"
list_file='../../data/extended_list.csv'

num_per_bin=5
bins="1100 1300 1500 1700 1900"

# $bins is intentionally unquoted so that every bin becomes its own argument.
python3 select_binned_players.py "$vals_dat_dir" "$list_file" "$num_per_bin" $bins

mkdir -p "$output_dir"

# IFS= and -r keep each line exactly as written in the list file.
while IFS= read -r player; do
    # With an empty name the source path would be the whole of $vals_dir,
    # so blank lines must never reach cp.
    if [ -z "$player" ]; then
        continue
    fi
    echo "$player"
    cp -r "${vals_dir}/${player}" "${output_dir}"
done < "$list_file"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Player Counting | ||
|
||
This is the code we used to count the number of games each player has. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import backend | ||
|
||
import argparse | ||
import bz2 | ||
|
||
import pandas | ||
|
||
@backend.logged_main
def main():
    """Sum per-player counts over several CSVs and write one sorted CSV.

    Command line: one or more input CSVs (each with ``player`` and ``count``
    columns) followed by the output path. The output is a bz2-compressed CSV
    with a ``player,count`` header, ordered by descending total count.
    """
    parser = argparse.ArgumentParser(description='Collect counts and create list from them', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs = '+', help='input csvs')
    parser.add_argument('output', help='output csv')
    args = parser.parse_args()

    counts = {}
    for p in args.inputs:
        backend.printWithDate(f"Processing {p}", end = '\r')
        # Read names strictly as text: with the default NA handling a player
        # called e.g. "NA" or "null" would be parsed as NaN and lose its name.
        df = pandas.read_csv(p, dtype={'player': str}, keep_default_na=False)
        # Walk the two columns in file order (keeps tie order in the stable
        # sort below) without the per-row Series that iterrows() would build.
        for player, count in zip(df['player'].tolist(), df['count'].tolist()):
            counts[player] = counts.get(player, 0) + count
    backend.printWithDate(f"Writing")
    with bz2.open(args.output, 'wt') as f:
        f.write('player,count\n')
        for p, c in sorted(counts.items(), key = lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import backend | ||
|
||
import argparse | ||
import bz2 | ||
|
||
@backend.logged_main
def main():
    """Count how many games each player appears in and write the counts.

    Command line: an input PGN path and an output path. Every game adds one to
    the count of its ``White`` and its ``Black`` player. The output is a
    bz2-compressed CSV with a ``player,count`` header, ordered by descending
    count.
    """
    parser = argparse.ArgumentParser(description='Count number of times each player occurs in pgn', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input pgn')
    parser.add_argument('output', help='output csv')
    # store_false means the value defaults to True (bullet games are skipped)
    # and passing the flag switches the filter OFF. The action is kept so
    # existing invocations behave as before; only the help text is corrected.
    parser.add_argument('--exclude_bullet', action='store_false', help='Bullet games are excluded by default; pass this flag to include them')
    args = parser.parse_args()

    games = backend.GamesFile(args.input)

    counts = {}

    # Pre-bind the index so the summary line works when there are no games.
    i = 0
    for i, (d, _) in enumerate(games):
        if args.exclude_bullet and 'Bullet' in d['Event']:
            continue
        add_player(d['White'], counts)
        add_player(d['Black'], counts)
        if i % 10000 == 0:
            backend.printWithDate(f"{i} done with {len(counts)} players from {args.input}", end = '\r')

    backend.printWithDate(f"{i} found total of {len(counts)} players from {args.input}")
    with bz2.open(args.output, 'wt') as f:
        f.write("player,count\n")
        for p, c in sorted(counts.items(), key = lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")
    backend.printWithDate("done")
|
||
def add_player(p, d):
    """Increase the tally stored under key ``p`` in mapping ``d`` by one.

    A key that is not present yet is created with the value 1. The mapping is
    modified in place and nothing is returned.
    """
    d[p] = d.get(p, 0) + 1
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import backend | ||
|
||
import os | ||
import os.path | ||
import csv | ||
import bz2 | ||
import argparse | ||
|
||
@backend.logged_main
def main():
    """Parse the command line and run the per-file game-statistics job.

    ``Files_lister`` is registered as the reader over ``targets_dir`` and
    ``Games_processor`` as the processor writing into ``output_dir``; both are
    driven by a ``backend.Multiproc`` created with ``--pool_size``.
    """
    parser = argparse.ArgumentParser(description='Get some stats about each of the games')
    for positional, text in (('targets_dir', 'input pgns dir'), ('output_dir', 'output csvs dir')):
        parser.add_argument(positional, help=text)
    parser.add_argument('--pool_size', type=int, default=64, help='Number of models to run in parallel')
    options = parser.parse_args()

    runner = backend.Multiproc(options.pool_size)
    runner.reader_init(Files_lister, options.targets_dir)
    runner.processor_init(Games_processor, options.output_dir)
    runner.run()
|
||
class Files_lister(backend.MultiprocIterable):
    """Hand out the ``.pgn.bz2`` files of one directory, one per ``next()``.

    Each item is a ``(path, name)`` tuple where ``name`` is the file name up
    to its first ``.``.
    """
    def __init__(self, targets_dir):
        self.targets_dir = targets_dir
        # Test the suffix rather than a substring, so that a name which merely
        # contains '.pgn.bz2' (e.g. 'x.pgn.bz2.csv.bz2') is not picked up.
        self.targets = [
            (p.path, p.name.split('.')[0])
            for p in os.scandir(targets_dir)
            if p.name.endswith('.pgn.bz2')
        ]
        backend.printWithDate(f"Found {len(self.targets)} targets in {targets_dir}")

    def __next__(self):
        # Printed before popping, so the figure includes the item being handed out.
        backend.printWithDate(f"Pushed target {len(self.targets)} remaining", end = '\r', flush = True)
        try:
            return self.targets.pop()
        except IndexError:
            # An exhausted list ends the iteration.
            raise StopIteration
|
||
class Games_processor(backend.MultiprocWorker):
    """Write one bz2-compressed CSV of per-game rows for a single player file."""

    def __init__(self, output_dir):
        self.output_dir = output_dir

    def __call__(self, path, name):
        """Read the games at ``path`` and write ``<output_dir>/<name>.csv.bz2``.

        ``name`` is taken to be the player the file belongs to: when it equals
        the ``Black`` header the row is written from Black's side, otherwise
        from White's side.
        """
        columns = ["player", "opponent","game_id", "ELO", "opp_ELO", "was_white", "result", "won", "UTCDate", "UTCTime", "TimeControl"]
        games = backend.GamesFile(path)
        out_path = os.path.join(self.output_dir, f"{name}.csv.bz2")
        with bz2.open(out_path, 'wt') as f:
            writer = csv.DictWriter(f, columns)
            writer.writeheader()
            for headers, _ in games:
                if headers['Black'] == name:
                    own_side, other_side, winning_result = 'Black', 'White', '0-1'
                else:
                    own_side, other_side, winning_result = 'White', 'Black', '1-0'
                writer.writerow({
                    'player': name,
                    # Last path component of the Site header.
                    'game_id': headers['Site'].split('/')[-1],
                    'result': headers['Result'],
                    'UTCDate': headers['UTCDate'],
                    'UTCTime': headers['UTCTime'],
                    'TimeControl': headers['TimeControl'],
                    'was_white': own_side == 'White',
                    'opponent': headers[other_side],
                    'ELO': headers[own_side + 'Elo'],
                    'opp_ELO': headers[other_side + 'Elo'],
                    'won': headers['Result'] == winning_result,
                })
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import backend | ||
|
||
import argparse | ||
import bz2 | ||
import glob | ||
import random | ||
import os.path | ||
import multiprocessing | ||
|
||
import pandas | ||
|
||
@backend.logged_main
def main():
    """Pick up to ``bin_size`` random players from each requested rating bin.

    Every ``*.csv.bz2`` in ``csvs_dir`` is loaded with ``load_player``; a
    player belongs to a bin when their mean rating, rounded down to a multiple
    of 100, equals the bin value (itself rounded down the same way). The
    chosen names are written to ``output_list``, one per line.
    """
    parser = argparse.ArgumentParser(description='Read all the metadata and select top n players for training/validation/testing', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('csvs_dir', help='dir of csvs')
    parser.add_argument('output_list', help='list of targets')
    parser.add_argument('bin_size', type=int, help='players per bin')
    parser.add_argument('bins', type=int, nargs = '+', help='bins')
    parser.add_argument('--pool_size', type=int, help='Number of threads to use for reading', default = 48)
    parser.add_argument('--seed', type=int, help='random seed', default = 1)
    args = parser.parse_args()
    random.seed(args.seed)

    bins = [int(b // 100 * 100) for b in args.bins]

    with multiprocessing.Pool(args.pool_size) as pool:
        players = pool.map(load_player, glob.glob(os.path.join(args.csvs_dir, '*.csv.bz2')))
    backend.printWithDate(f"Found {len(players)} players, using {len(bins)} bins")
    binned_players = {b : [] for b in bins}
    for p in players:
        pe_round = int(p['elo'] // 100 * 100)
        if pe_round in binned_players:
            binned_players[pe_round].append(p)
    backend.printWithDate(f"Found: " + ', '.join([f"{b} : {len(p)}" for b, p in binned_players.items()]))

    with open(args.output_list, 'wt') as f:
        for b, p in binned_players.items():
            random.shuffle(p)
            names = [d['name'] for d in p[:args.bin_size]]
            print(b, names)
            # Write nothing for an empty bin: a lone blank line in the list
            # would be read back as a player with an empty name.
            if names:
                f.write('\n'.join(names) + '\n')
|
||
def load_player(path):
    """Summarise one player's CSV as a ``name`` / ``elo`` / ``count`` dict.

    ``name`` is the first value of the ``player`` column, ``elo`` the mean of
    the last 10000 values of the ``ELO`` column and ``count`` the row count.
    """
    games = pandas.read_csv(path, low_memory=False)
    recent_ratings = games['ELO'].iloc[-10000:]
    summary = {}
    summary['name'] = games['player'].iloc[0]
    summary['elo'] = recent_ratings.mean()
    summary['count'] = len(games)
    return summary
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import backend | ||
|
||
import argparse | ||
import bz2 | ||
import glob | ||
import random | ||
import os.path | ||
import multiprocessing | ||
|
||
import pandas | ||
|
||
@backend.logged_main
def main():
    """Choose the players with the most games and split them three ways.

    Every file matching ``*csv.bz2`` in ``inputs`` is checked with
    ``check_player``; the accepted players are ranked by game count, the top
    ``num_train + num_val + num_test`` are kept and shuffled with the given
    seed, then handed out to the train, validation and test lists in that
    order.
    """
    parser = argparse.ArgumentParser(description='Read all the metadata and select top n players for training/validation/testing', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', help='input csvs dir')
    parser.add_argument('output_train', help='output csv for training data')
    parser.add_argument('num_train', type=int, help='num for main training')
    parser.add_argument('output_val', help='output csv for validation data')
    parser.add_argument('num_val', type=int, help='num for big validation run')
    parser.add_argument('output_test', help='output csv for testing data')
    parser.add_argument('num_test', type=int, help='num for holdout set')
    parser.add_argument('--pool_size', type=int, help='Number of models to run in parallel', default = 48)
    parser.add_argument('--min_elo', type=int, help='min elo to select', default = 1100)
    parser.add_argument('--max_elo', type=int, help='max elo to select', default = 2000)
    parser.add_argument('--seed', type=int, help='random seed', default = 1)
    args = parser.parse_args()
    random.seed(args.seed)

    csv_paths = glob.glob(os.path.join(args.inputs, '*csv.bz2'))
    jobs = ((csv_path, args.min_elo, args.max_elo) for csv_path in csv_paths)
    with multiprocessing.Pool(args.pool_size) as pool:
        results = pool.starmap(check_player, jobs)

    # Rank the accepted players by game count (tuple index 1), highest first.
    total_wanted = args.num_train + args.num_val + args.num_test
    eligible = [entry for entry in results if entry is not None]
    eligible.sort(key=lambda entry: entry[1], reverse=True)
    players_top = eligible[:total_wanted]

    random.shuffle(players_top)

    # Each call pops its share off the end of the same shuffled list.
    splits = (
        (args.output_train, args.num_train),
        (args.output_val, args.num_val),
        (args.output_test, args.num_test),
    )
    for out_path, wanted in splits:
        write_output_file(out_path, wanted, players_top)
|
||
def write_output_file(path, count, targets):
    """Pop ``count`` entries off the end of ``targets`` and write them as CSV.

    The file starts with a ``player,count,ELO`` header; each following row
    holds items 0, 1 and 2 of one popped entry. ``targets`` is modified in
    place.
    """
    header = "player,count,ELO\n"
    with open(path, 'wt') as out:
        out.write(header)
        for _ in range(count):
            entry = targets.pop()
            row = f"{entry[0]},{entry[1]},{entry[2]}\n"
            out.write(row)
|
||
def check_player(path, min_elo, max_elo):
    """Return ``(name, game_count, mean_elo)`` for a player inside the Elo band.

    ``mean_elo`` is the mean of the last 10000 values of the ``ELO`` column and
    must lie strictly between ``min_elo`` and ``max_elo``; otherwise ``None``
    is returned. ``name`` is the file name up to its first ``.``.
    """
    games = pandas.read_csv(path, low_memory=False)
    mean_elo = games['ELO'].iloc[-10000:].mean()
    if not (min_elo < mean_elo < max_elo):
        return None
    file_name = path.rsplit('/', 1)[-1]
    player = file_name.split('.', 1)[0]
    return player, len(games), mean_elo
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.