-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add semcor pipeline, dimensionality parameter, epoch parameter
- Loading branch information
garrafao
committed
Jan 5, 2020
1 parent
7b72046
commit 361c173
Showing
47 changed files
with
521 additions
and
200 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import sys | ||
sys.path.append('./modules/') | ||
|
||
import sys | ||
from sklearn.metrics import average_precision_score | ||
from collections import Counter | ||
from docopt import docopt | ||
import numpy as np | ||
import logging | ||
import time | ||
|
||
|
||
def _read_rows(path):
    """Return the tab-separated columns of every non-blank line of *path*."""
    with open(path, 'r', encoding='utf-8') as f_in:
        return [line.strip().split('\t') for line in f_in if line.strip()]


def main():
    """
    Calculate the Average Precision (AP) of full rank of targets.

    Reads the gold classes and the predicted values row by row, drops every
    row whose predicted value is nan, and prints one tab-separated line with
    classFileName, resultFileName, the AP and an approximate random baseline
    (share of rows with gold class 1.0). Both numbers are printed as nan if
    no row is left or if the AP computation raises an IndexError.

    Raises:
        ValueError: if resultFile contains more rows than classFile.
    """

    # Get the arguments
    args = docopt("""Calculate the Average Precision (AP) of full rank of targets.

    Usage:
        ap.py <classFile> <resultFile> <classFileName> <resultFileName>

        <classFile> = file with gold class assignments
        <resultFile> = file with values assigned to targets
        <classFileName> = name of class file to print
        <resultFileName> = name of result file to print

    Note:
        Assumes tab-separated CSV files as input. Assumes same number and order of rows. classFile must contain class assignments in first column. resultFile must contain targets in first column and values in second column. Targets with nan are ignored.

    """)

    classFile = args['<classFile>']
    resultFile = args['<resultFile>']
    classFileName = args['<classFileName>']
    resultFileName = args['<resultFileName>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get gold data (class assignment is the first column)
    classes = [float(row[0]) for row in _read_rows(classFile)]

    # Get predictions (target in first column, value in second column)
    results = [(row[0], float(row[1])) for row in _read_rows(resultFile)]

    # Rows are aligned by position. Keeping the rows in a list (instead of a
    # dict keyed by target) keeps the alignment intact even if a target
    # occurs more than once in resultFile.
    if len(results) > len(classes):
        raise ValueError('%s has %d rows but %s only has %d class assignments.'
                         % (resultFile, len(results), classFile, len(classes)))
    if len(results) < len(classes):
        logging.warning('%s has fewer rows than %s; surplus classes are ignored.',
                        resultFile, classFile)

    # Read in values, exclude rows whose predicted value is nan
    kept = [(gold_class, value)
            for gold_class, (_, value) in zip(classes, results)
            if not np.isnan(value)]
    gold = np.array([gold_class for gold_class, _ in kept])
    values = np.array([value for _, value in kept])

    if len(classes) != len(gold):
        print('nan encountered!')

    # Compute average precision
    ap, rb = float('nan'), float('nan')
    if len(gold) == 0:
        # Nothing left to rank; avoid calling the metric and dividing by zero.
        logging.info('No targets with non-nan values, AP is undefined.')
    else:
        try:
            ap = average_precision_score(gold, values)
            rb = Counter(gold)[1.0] / len(gold)  # approximate random baseline
        except IndexError as e:
            logging.info(e)
            ap, rb = float('nan'), float('nan')

    print('\t'.join((classFileName, resultFileName, str(ap), str(rb))))

    logging.info("--- %s seconds ---", time.time() - start_time)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import sys | ||
sys.path.append('./modules/') | ||
|
||
from docopt import docopt | ||
import logging | ||
import time | ||
import random | ||
|
||
def main():
    """
    Measure assigning random values to targets (as baseline).

    Reads tab-separated word pairs from the test set, draws one random score
    per pair and writes one "<target(s)>\\t<score>" line per input row to the
    output path. Blank input lines are skipped.

    Raises:
        ValueError: if a non-blank input line has fewer than two columns.
    """

    # Get the arguments
    args = docopt("""Measure assigning random values to targets (as baseline).

    Usage:
        rand.py [(-f | -s)] (-r) <testset> <outPath>

        <testset> = path to file with tab-separated word pairs
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file
        -r, --rel   assign random real numbers between 0 and 1

    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    is_rel = args['--rel']
    testset = args['<testset>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load targets (split every line only once)
    targets = []
    with open(testset, 'r', encoding='utf-8') as f_in:
        for line_no, line in enumerate(f_in, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            columns = stripped.split('\t')
            if len(columns) < 2:
                raise ValueError('%s, line %d: expected two tab-separated targets, got %r'
                                 % (testset, line_no, stripped))
            targets.append((columns[0], columns[1]))

    # -r is mandatory in the usage pattern and is the only scoring mode;
    # fail explicitly rather than with an unbound variable if that changes.
    if not is_rel:
        raise ValueError('No scoring mode selected (expected -r).')

    # One draw per input row; a repeated pair keeps the last score drawn.
    scores = {}
    for pair in targets:
        scores[pair] = random.uniform(0, 1)

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                label = t1
            elif is_scd:  # output only second target string
                label = t2
            else:  # standard outputs both target strings
                label = '%s,%s' % (t1, t2)
            f_out.write('%s\t%s\n' % (label, scores[(t1, t2)]))

    logging.info("--- %s seconds ---", time.time() - start_time)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
|
||
## Make target input files
# Reads $targets (one target per line) and creates $testset and $testsetwi
# if they do not exist yet. The three variables are expected to be set by
# the script that sources this file.

# Quote the path: with an empty/unset $targets the unquoted test
# `[ ! -f ]` is false and the check would be skipped silently.
if [ ! -f "$targets" ];
then
    echo "Error: No target file found at $targets." >&2
    exit 1 # signal failure instead of reporting success
fi

if [ ! -f "$testset" ];
then
    # read line by line (no glob expansion of targets);
    # `|| [ -n "$i" ]` keeps a last line without trailing newline
    while read -r i || [ -n "$i" ]
    do
        [ -n "$i" ] || continue # skip blank lines
        # printf writes the target literally (echo -e would interpret backslashes)
        printf '%s\t%s\n' "$i" "$i" >> "$testset" # general input
    done < "$targets"
fi

if [ ! -f "$testsetwi" ];
then
    while read -r i || [ -n "$i" ]
    do
        [ -n "$i" ] || continue # skip blank lines
        printf '%s_\t%s\n' "$i" "$i" >> "$testsetwi" # input for word injection
    done < "$targets"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,31 @@ | ||
shopt -s extglob # For more powerful regular expressions in shell | ||
|
||
### Define parameters ### | ||
declare -a corpDir1="corpora/durel/corpus1/" # directory for corpus1 files (all files in directory will be read) | ||
declare -a corpDir2="corpora/durel/corpus2/" # directory for corpus2 files (all files in directory will be read) | ||
declare -a wiCorpDir="corpora/durel/corpus_wi/" # directory for word-injected corpus (only needed for Word Injection) | ||
declare -a freqnorms=(26650530 40323497) # normalization constants for token frequency (total number of tokens in first and second corpus, *before cleaning*) | ||
declare -a typesnorms=(252437 796365) # normalization constants for number of context types (total number of types in first and second corpus, *before cleaning*) | ||
declare -a windowSizes=(2 5 10) # window sizes for all models | ||
declare -a ks=(5 1) # values for shifting parameter k | ||
declare -a ts=(0.001 None) # values for subsampling parameter t | ||
declare -a iterations=(1 2 3 4 5) # list of iterations, each item is one iteration, for five iterations define: iterations=(1 2 3 4 5) | ||
declare -a dim=300 # dimensionality of low-dimensional matrices (SVD/RI/SGNS) | ||
declare -a testset="testsets/durel/targets.tsv" # target words for which change scores should be predicted (one target per line repeated twice with tab-separation, i.e., 'word\tword') | ||
declare -a testsetwi="testsets/durel/targets_wi.tsv" # target words for Word Injection (one target per line, injected version in first column, non-injected version in second column, i.e., 'word_\tword') | ||
declare -a goldscorefile="testsets/durel/gold.tsv" # file with gold scores for target words in same order as targets in testsets | ||
corpDir1="corpora/durel/corpus1/" # directory for corpus1 files (all files in directory will be read) | ||
corpDir2="corpora/durel/corpus2/" # directory for corpus2 files (all files in directory will be read) | ||
wiCorpDir="corpora/durel/corpus_wi/" # directory for word-injected corpus (only needed for Word Injection) | ||
freqnorms=(26650530 40323497) # normalization constants for token frequency (total number of tokens in first and second corpus, *before cleaning*) | ||
typesnorms=(252437 796365) # normalization constants for number of context types (total number of types in first and second corpus, *before cleaning*) | ||
windowSizes=(2 5 10) # window sizes for all models | ||
ks=(5 1) # values for shifting parameter k | ||
ts=(0.001 None) # values for subsampling parameter t | ||
iterations=(1 2 3 4 5) # list of iterations, each item is one iteration, for five iterations define: iterations=(1 2 3 4 5) | ||
dims=(300) # dimensionality of low-dimensional matrices (SVD/RI/SGNS) | ||
eps=(5) # training epochs for SGNS | ||
targets="testsets/durel/targets.tsv" # target words for which change scores should be predicted (one target per line) | ||
testset="testsets/durel/targets_input.tsv" # target words in input format (one target per line repeated twice with tab-separation, i.e., 'word\tword', will be created) | ||
testsetwi="testsets/durel/targets_wi.tsv" # target words in word injection format (one target per line, injected version in first column, non-injected version in second column, i.e., 'word_\tword', will be created) | ||
goldrankfile="testsets/durel/rank.tsv" # file with gold scores for target words in same order as targets in testsets | ||
goldclassfile="" # file with gold classes for target words in same order as targets in testsets (leave undefined if non-existent) | ||
|
||
# Get normalization constants for dispersion measures | ||
declare -a freqnorm1=${freqnorms[0]} | ||
declare -a freqnorm2=${freqnorms[1]} | ||
declare -a typesnorm1=${typesnorms[0]} | ||
declare -a typesnorm2=${typesnorms[1]} | ||
freqnorm1=${freqnorms[0]} | ||
freqnorm2=${freqnorms[1]} | ||
typesnorm1=${typesnorms[0]} | ||
typesnorm2=${typesnorms[1]} | ||
|
||
### Make folder structure ### | ||
source scripts/make_folders.sh | ||
|
||
### Make target input files ### | ||
source scripts/make_targets.sh |
Oops, something went wrong.