-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocess_data.sh
executable file
·68 lines (56 loc) · 1.51 KB
/
preprocess_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env bash
#===============================================================================
#
#         FILE: preprocess_data.sh
#
#        USAGE: ./preprocess_data.sh
#
#  DESCRIPTION: Preprocess tinyshakespeare corpus: normalize the raw text,
#               split it into train/valid/test by line ranges, and build a
#               token vocabulary (plus special tokens) from the training set.
#
#        NOTES: ---
#      CREATED: 06/20/15 16:49
#     REVISION: ---
#===============================================================================
set -o nounset   # Treat unset variables as an error
set -o errexit   # Abort on the first failing command
set -o pipefail  # A pipeline fails if any stage fails (vocab pipeline below)

datadir=data/tinyshakespeare
infile=${datadir}/tinyshakespeare.txt
outfile=${datadir}/tinyshakespeare_char.txt

# Fail early with a clear message instead of an opaque Python traceback.
if [[ ! -f "${infile}" ]]; then
  printf 'error: input corpus not found: %s\n' "${infile}" >&2
  exit 1
fi

# Normalize the corpus (lowercase, keep empty lines as sentence separators).
python src/util/preprocess_data.py \
  --infile "${infile}" \
  --outfile "${outfile}" \
  --lowercase \
  --empty-line
  #--clean-token \

# split data into train, valid and test
# train 1-2000 lines
# valid 2001-3000 lines
# the rest is kept as test set
trainfile=${datadir}/train.txt
vocfile=${datadir}/voc.txt
validfile=${datadir}/valid.txt
testfile=${datadir}/test.txt
train_lb=1
train_ub=2000
valid_lb=2001
valid_ub=3000
test_lb=3001

# split training set (pass bounds via -v, never by string interpolation)
awk -v lb="${train_lb}" -v ub="${train_ub}" \
  'NR>=lb && NR<=ub' \
  "${outfile}" > "${trainfile}"
# split valid set
awk -v lb="${valid_lb}" -v ub="${valid_ub}" \
  'NR>=lb && NR<=ub' \
  "${outfile}" > "${validfile}"
# split test set
awk -v lb="${test_lb}" \
  'NR>=lb' \
  "${outfile}" > "${testfile}"

# build vocabulary: one token per line, deduplicated.
# tr (not sed "s/ /\n/g") because \n in a sed replacement is a GNU extension;
# BSD sed would insert a literal 'n'. sort -u == sort | uniq in one process.
tr ' ' '\n' < "${trainfile}" | sort -u > "${vocfile}"

# append end-of-sentence, unknown and append special tokens
printf '%s\n' "</s>" "<unk>" "<append>" >> "${vocfile}"