Skip to content

Commit

Permalink
KDD code release
Browse files Browse the repository at this point in the history
  • Loading branch information
reidmcy committed Aug 14, 2022
1 parent 47db14a commit 012c217
Show file tree
Hide file tree
Showing 104 changed files with 12,809 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.zip

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
11 changes: 11 additions & 0 deletions 0-player_counting/0-find_top_players.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Launch one detached `screen` session per raw Lichess dump. Each session runs
# find_top_players.py on a single .pgn.bz2 file and writes the per-file player
# counts to ${output_dir}/<dump name>.csv.bz2.
#
# The shebang must be a real `#!` line: the brace expansions below are a bash
# feature and do not work under a plain POSIX sh.

lichess_raw_dir='/data/chess/bz2/standard/'
output_dir='../../data/player_counts'
mkdir -p "$output_dir"

# Selects files ending in -01 .. -11, plus files ending in -12 whose preceding
# character is 3..8 (presumably the last digit of the year -- confirm against
# the dump naming scheme).
for t in "$lichess_raw_dir"/*-{01..11}.pgn.bz2 "$lichess_raw_dir"/*{3..8}-12.pgn.bz2; do
    fname="$(basename -- "$t")"
    echo "${t} ${output_dir}/${fname}.csv.bz2"
    # -dm starts the session detached, so all files are processed in parallel.
    screen -S "filter-${fname}" -dm bash -c "source ~/.bashrc; python3 find_top_players.py ${t} ${output_dir}/${fname}.csv.bz2"
done
49 changes: 49 additions & 0 deletions 0-player_counting/1-collect_top_players.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Build the transfer-learning player sets:
#   1. merge the per-file player counts into one ranked list,
#   2. split the raw dumps into one PGN per top player,
#   3. compute per-player game metadata,
#   4. pick train / validate / test players and copy their data into place.
#
# The shebang must be a real `#!` line: the brace expansions below need bash.

lichess_raw_dir='/data/chess/bz2/standard/'
counts_dir='../../data/player_counts'
counts_file='../../data/player_counts_combined.csv.bz2'
top_list='../../data/player_counts_combined_top_names.csv.bz2'

output_2000_dir='../../data/top_2000_player_games'
output_2000_metadata_dir='../../data/top_2000_player_data'

players_list='../../data/select_transfer_players'

final_data_dir='../../data/transfer_players_data'

num_train=10
num_val=900
num_test=100

python3 combine_player_counts.py "$counts_dir"/* "$counts_file"

# NOTE(review): the combined file starts with a "player,count" header line, so
# the first 2000 lines hold the header plus 1999 player rows -- confirm that
# split_by_players.py expects the header.
bzcat "$counts_file" | head -n 2000 | bzip2 > "$top_list"

mkdir -p "$output_2000_dir"

python3 split_by_players.py "$top_list" "$lichess_raw_dir"/*-{01..11}.pgn.bz2 "$lichess_raw_dir"/*{3..8}-12.pgn.bz2 "$output_2000_dir"

rm -v "$top_list"

mkdir -p "$output_2000_metadata_dir"

python3 player_game_counts.py "$output_2000_dir" "$output_2000_metadata_dir"

# No trailing backslash after the last argument: a dangling continuation would
# silently swallow whatever line follows it.
python3 select_top_players.py "$output_2000_metadata_dir" \
    "${players_list}_train.csv" "$num_train" \
    "${players_list}_validate.csv" "$num_val" \
    "${players_list}_test.csv" "$num_test"

mkdir -p "$final_data_dir"
mkdir -p "$final_data_dir/metadata"
cp -v "${players_list}"*.csv "$final_data_dir/metadata"

for c in "train" "validate" "test"; do
    # -p so that re-running the script does not fail on existing directories.
    mkdir -p "$final_data_dir/${c}"
    mkdir -p "$final_data_dir/${c}_metadata"
    # Skip the CSV header, then take the first column (the player name).
    for t in $(tail -n +2 "${players_list}_${c}.csv" | awk -F ',' '{print $1}'); do
        cp -v "${output_2000_dir}/${t}.pgn.bz2" "$final_data_dir/${c}"
        cp "${output_2000_metadata_dir}/${t}.csv.bz2" "$final_data_dir/${c}_metadata"
    done
done
20 changes: 20 additions & 0 deletions 0-player_counting/2-select_extended_set.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Pick a fixed number of validation players per rating bin and copy each
# selected player's directory into the extended set.
set -e

vals_dat_dir="../../data/transfer_players_data/validate_metadata/"
vals_dir="../../data/transfer_players_validate"
output_dir="../../data/transfer_players_extended"
list_file='../../data/extended_list.csv'

num_per_bin=5
bins="1100 1300 1500 1700 1900"


# $bins is intentionally unquoted: each bin must become its own argument.
python3 select_binned_players.py "$vals_dat_dir" "$list_file" "$num_per_bin" $bins

mkdir -p "$output_dir"

while read -r player; do
    # Skip blank lines: with an empty name the source path collapses to
    # "${vals_dir}/" and cp -r would copy the whole validation directory.
    [ -n "$player" ] || continue
    echo "$player"
    cp -r "${vals_dir}/${player}" "${output_dir}"
done < "$list_file"
3 changes: 3 additions & 0 deletions 0-player_counting/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Player Counting

This is the code we used to count the number of games each player has.
31 changes: 31 additions & 0 deletions 0-player_counting/combine_player_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import backend

import argparse
import bz2

import pandas

@backend.logged_main
def main():
    """Sum per-file player counts into a single ranked CSV.

    Command line arguments:
        inputs: one or more CSV files with ``player`` and ``count`` columns.
        output: path of the bz2-compressed CSV to write.

    The output has a ``player,count`` header followed by one row per player,
    ordered by total count with the highest first.
    """
    parser = argparse.ArgumentParser(description='Collect counts and create list from them', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs = '+', help='input csvs')
    parser.add_argument('output', help='output csv')
    args = parser.parse_args()

    counts = {}
    for p in args.inputs:
        backend.printWithDate(f"Processing {p}", end = '\r')
        # Usernames such as "NA", "null" or "nan" must stay strings. With the
        # default NA handling pandas would turn them into float NaN, which
        # does not behave as a stable dictionary key.
        df = pandas.read_csv(p, dtype={'player': str}, keep_default_na=False)
        # Walking the two columns directly avoids building a Series per row,
        # which is what iterrows() does.
        for player, count in zip(df['player'], df['count']):
            counts[player] = counts.get(player, 0) + int(count)
    backend.printWithDate("Writing")
    with bz2.open(args.output, 'wt') as f:
        f.write('player,count\n')
        # sorted() is stable, so players with equal totals keep the order in
        # which they were first seen.
        for p, c in sorted(counts.items(), key = lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")

if __name__ == '__main__':
main()
42 changes: 42 additions & 0 deletions 0-player_counting/find_top_players.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import backend

import argparse
import bz2

@backend.logged_main
def main():
    """Count how often each player appears in one PGN file.

    Command line arguments:
        input: path of the PGN file, read through ``backend.GamesFile``.
        output: path of the bz2-compressed CSV to write.
        --exclude_bullet: see the note on the flag below.

    Every counted game adds one to both its ``White`` and ``Black`` player.
    The output has a ``player,count`` header followed by one row per player,
    ordered by count with the highest first.
    """
    parser = argparse.ArgumentParser(description='Count number of times each player occurs in pgn', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input pgn')
    parser.add_argument('output', help='output csv')
    # NOTE(review): action='store_false' makes args.exclude_bullet default to
    # True, so bullet games are skipped unless the flag is passed, and passing
    # --exclude_bullet turns the filter OFF. Left unchanged because existing
    # runs rely on the default; confirm the intended meaning.
    parser.add_argument('--exclude_bullet', action='store_false', help='Remove bullet games from counts')
    args = parser.parse_args()

    games = backend.GamesFile(args.input)

    counts = {}

    # Pre-bind the index so the summary line below still works when the input
    # contains no games (the loop variable would otherwise be undefined).
    i = 0
    for i, (d, _) in enumerate(games):
        if args.exclude_bullet and 'Bullet' in d['Event']:
            continue
        add_player(d['White'], counts)
        add_player(d['Black'], counts)
        if i % 10000 == 0:
            backend.printWithDate(f"{i} done with {len(counts)} players from {args.input}", end = '\r')

    backend.printWithDate(f"{i} found total of {len(counts)} players from {args.input}")
    with bz2.open(args.output, 'wt') as f:
        f.write("player,count\n")
        for p, c in sorted(counts.items(), key = lambda x: x[1], reverse=True):
            f.write(f"{p},{c}\n")
    backend.printWithDate("done")

def add_player(p, d):
    """Increment the tally for player ``p`` in the mapping ``d`` in place.

    A player seen for the first time starts at 1.
    """
    d[p] = d.get(p, 0) + 1

if __name__ == '__main__':
main()
67 changes: 67 additions & 0 deletions 0-player_counting/player_game_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import backend

import os
import os.path
import csv
import bz2
import argparse

@backend.logged_main
def main():
    """Write one per-game metadata CSV for every player PGN in a directory.

    Command line arguments:
        targets_dir: directory holding the per-player ``.pgn.bz2`` files.
        output_dir: directory that receives the ``.csv.bz2`` files.
        --pool_size: passed to ``backend.Multiproc`` (default 64).
    """
    parser = argparse.ArgumentParser(description='Get some stats about each of the games')
    parser.add_argument('targets_dir', help='input pgns dir')
    parser.add_argument('output_dir', help='output csvs dir')
    parser.add_argument('--pool_size', type=int, help='Number of models to run in parallel', default = 64)
    args = parser.parse_args()

    # Files_lister supplies the (path, name) work items and Games_processor
    # handles each one.
    runner = backend.Multiproc(args.pool_size)
    runner.reader_init(Files_lister, args.targets_dir)
    runner.processor_init(Games_processor, args.output_dir)
    runner.run()

class Files_lister(backend.MultiprocIterable):
    """Iterator over the ``.pgn.bz2`` files found in a directory.

    Each item is a ``(path, name)`` tuple, where ``name`` is the part of the
    file name before its first dot.
    """

    def __init__(self, targets_dir):
        self.targets_dir = targets_dir
        found = []
        for entry in os.scandir(targets_dir):
            if '.pgn.bz2' in entry.name:
                found.append((entry.path, entry.name.split('.')[0]))
        self.targets = found
        backend.printWithDate(f"Found {len(self.targets)} targets in {targets_dir}")

    def __next__(self):
        """Return the next pending target, taken from the end of the list."""
        try:
            backend.printWithDate(f"Pushed target {len(self.targets)} remaining", end = '\r', flush = True)
            target = self.targets.pop()
        except IndexError:
            # Nothing left to hand out.
            raise StopIteration
        return target

class Games_processor(backend.MultiprocWorker):
    """Worker that turns one player's PGN file into a per-game CSV."""

    def __init__(self, output_dir):
        self.output_dir = output_dir

    def __call__(self, path, name):
        """Write ``<output_dir>/<name>.csv.bz2`` with one row per game in ``path``.

        ``name`` is the player the file belongs to. A game is recorded from
        Black's side when its ``Black`` header equals ``name``, and from
        White's side in every other case.
        """
        columns = ["player", "opponent","game_id", "ELO", "opp_ELO", "was_white", "result", "won", "UTCDate", "UTCTime", "TimeControl"]
        games = backend.GamesFile(path)
        csv_path = os.path.join(self.output_dir, f"{name}.csv.bz2")
        with bz2.open(csv_path, 'wt') as f:
            writer = csv.DictWriter(f, columns)
            writer.writeheader()
            for d, _ in games:
                # Fields that do not depend on which colour the player had.
                row = {
                    'player': name,
                    # Last path component of the Site header.
                    'game_id': d['Site'].split('/')[-1],
                    'result': d['Result'],
                    'UTCDate': d['UTCDate'],
                    'UTCTime': d['UTCTime'],
                    'TimeControl': d['TimeControl'],
                }
                if d['Black'] == name:
                    row.update({
                        'was_white': False,
                        'opponent': d['White'],
                        'ELO': d['BlackElo'],
                        'opp_ELO': d['WhiteElo'],
                        'won': d['Result'] == '0-1',
                    })
                else:
                    row.update({
                        'was_white': True,
                        'opponent': d['Black'],
                        'ELO': d['WhiteElo'],
                        'opp_ELO': d['BlackElo'],
                        'won': d['Result'] == '1-0',
                    })
                writer.writerow(row)

if __name__ == '__main__':
main()
52 changes: 52 additions & 0 deletions 0-player_counting/select_binned_players.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import backend

import argparse
import bz2
import glob
import random
import os.path
import multiprocessing

import pandas

@backend.logged_main
def main():
    """Pick up to ``bin_size`` random players from each requested rating bin.

    Command line arguments:
        csvs_dir: directory of per-player ``.csv.bz2`` metadata files.
        output_list: text file to write, one selected player name per line.
        bin_size: maximum number of players taken from each bin.
        bins: ratings; each is rounded down to a multiple of 100 and selects
            the players whose mean ELO rounds down to the same value.
        --pool_size: number of worker processes used for reading.
        --seed: seed for the ``random`` module.
    """
    parser = argparse.ArgumentParser(description='Read all the metadata and select top n players for training/validation/testing', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('csvs_dir', help='dir of csvs')
    parser.add_argument('output_list', help='list of targets')
    parser.add_argument('bin_size', type=int, help='players per bin')
    parser.add_argument('bins', type=int, nargs = '+', help='bins')
    parser.add_argument('--pool_size', type=int, help='Number of threads to use for reading', default = 48)
    parser.add_argument('--seed', type=int, help='random seed', default = 1)
    args = parser.parse_args()
    random.seed(args.seed)

    bins = [int(b // 100 * 100) for b in args.bins]

    # NOTE(review): glob.glob returns paths in arbitrary order, so the seed
    # alone may not reproduce a selection on another machine.
    with multiprocessing.Pool(args.pool_size) as pool:
        players = pool.map(load_player, glob.glob(os.path.join(args.csvs_dir, '*.csv.bz2')))
    backend.printWithDate(f"Found {len(players)} players, using {len(bins)} bins")
    binned_players = {b : [] for b in bins}
    for p in players:
        pe_round = int(p['elo'] // 100 * 100)
        if pe_round in binned_players:
            binned_players[pe_round].append(p)
    backend.printWithDate(f"Found: " + ', '.join([f"{b} : {len(p)}" for b, p in binned_players.items()]))

    with open(args.output_list, 'wt') as f:
        for b, p in binned_players.items():
            random.shuffle(p)
            names = [d['name'] for d in p[:args.bin_size]]
            print(b, names)
            # An empty bin must not emit a blank line: 2-select_extended_set.sh
            # reads this file line by line and would treat the blank line as
            # an empty player name.
            if names:
                f.write('\n'.join(names) + '\n')

def load_player(path):
    """Summarise one player's metadata CSV.

    Returns a dict with:
        name: the ``player`` value of the first row,
        elo: the mean of the ``ELO`` column over the last 10000 rows,
        count: the total number of rows.
    """
    games = pandas.read_csv(path, low_memory=False)
    mean_elo = games['ELO'].iloc[-10000:].mean()
    n_games = len(games)
    first_name = games['player'].iloc[0]
    return {'name' : first_name, 'elo' : mean_elo, 'count' : n_games}
if __name__ == "__main__":
main()
63 changes: 63 additions & 0 deletions 0-player_counting/select_top_players.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import backend

import argparse
import bz2
import glob
import random
import os.path
import multiprocessing

import pandas

@backend.logged_main
def main():
    """Select the players with the most games and split them into three lists.

    Every ``*csv.bz2`` file in ``inputs`` is checked with ``check_player``.
    The eligible players with the highest game counts are kept (enough to fill
    all three lists), shuffled with the given seed, and written to the train,
    validation and test CSVs in that order. ``write_output_file`` removes the
    players it writes from the shared list, so no player lands in two files.
    """
    parser = argparse.ArgumentParser(description='Read all the metadata and select top n players for training/validation/testing', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', help='input csvs dir')
    parser.add_argument('output_train', help='output csv for training data')
    parser.add_argument('num_train', type=int, help='num for main training')
    parser.add_argument('output_val', help='output csv for validation data')
    parser.add_argument('num_val', type=int, help='num for big validation run')
    parser.add_argument('output_test', help='output csv for testing data')
    parser.add_argument('num_test', type=int, help='num for holdout set')
    parser.add_argument('--pool_size', type=int, help='Number of models to run in parallel', default = 48)
    parser.add_argument('--min_elo', type=int, help='min elo to select', default = 1100)
    parser.add_argument('--max_elo', type=int, help='max elo to select', default = 2000)
    parser.add_argument('--seed', type=int, help='random seed', default = 1)
    args = parser.parse_args()
    random.seed(args.seed)

    csv_paths = glob.glob(os.path.join(args.inputs, '*csv.bz2'))
    jobs = [(csv_path, args.min_elo, args.max_elo) for csv_path in csv_paths]

    with multiprocessing.Pool(args.pool_size) as pool:
        results = pool.starmap(check_player, jobs)

    # check_player returns None for players outside the ELO range.
    eligible = [r for r in results if r is not None]
    # Element 1 of each tuple is the player's game count.
    eligible.sort(key = lambda r : r[1], reverse=True)
    total_wanted = args.num_train + args.num_val + args.num_test
    players_top = eligible[:total_wanted]

    random.shuffle(players_top)

    splits = (
        (args.output_train, args.num_train),
        (args.output_val, args.num_val),
        (args.output_test, args.num_test),
    )
    for out_path, how_many in splits:
        write_output_file(out_path, how_many, players_top)

def write_output_file(path, count, targets):
    """Write ``count`` players from ``targets`` to a CSV file at ``path``.

    Players are taken from the end of ``targets`` and removed from it, so the
    caller's list shrinks by ``count``. Each entry is indexed as
    ``(player, count, ELO)``. Raises IndexError if ``targets`` runs out first.
    """
    with open(path, 'wt') as out:
        out.write("player,count,ELO\n")
        for _ in range(count):
            entry = targets.pop()
            out.write("{},{},{}\n".format(entry[0], entry[1], entry[2]))

def check_player(path, min_elo, max_elo):
    """Return ``(name, game_count, mean_elo)`` for a player in the ELO range.

    ``mean_elo`` is the mean of the ``ELO`` column over the last 10000 rows of
    the CSV at ``path``. It must lie strictly between ``min_elo`` and
    ``max_elo``; otherwise None is returned. ``name`` is the part of the file
    name before its first dot.
    """
    games = pandas.read_csv(path, low_memory=False)
    mean_elo = games['ELO'].iloc[-10000:].mean()
    n_games = len(games)
    if not (min_elo < mean_elo < max_elo):
        return None
    file_name = path.split('/')[-1]
    return file_name.split('.')[0], n_games, mean_elo

if __name__ == "__main__":
main()
Loading

0 comments on commit 012c217

Please sign in to comment.