-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathcqasys_annie.bash
executable file
·173 lines (148 loc) · 3.64 KB
/
cqasys_annie.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env bash
################
# environments #
################
# Directory this script lives in; the bundled virtualenv archive is looked
# up relative to it. "$0" is quoted so an install path containing
# whitespace is handled as a single word.
base_dir=$(dirname "$0")
# On Linux, run inside the bundled Python virtualenv, unpacking the archive
# into /tmp the first time (i.e. while /tmp/venv_annie does not exist yet).
if [[ "$(uname)" == "Linux" ]]; then
  if [[ ! -d /tmp/venv_annie ]]; then
    echo "0) setting environments" >&2
    tar -C /tmp -xzf "${base_dir}/env/venv_annie.tar.gz"
  fi
  source /tmp/venv_annie/bin/activate
fi
#############
# arguments #
#############
#######################################
# Print the command-line help text and terminate the script.
# Globals:   base_dir (read) - shown as the default resource dir location
# Arguments: $1 - optional error message; when given it is printed after
#                 the help text and the script exits with a failure status
# Outputs:   help text (and the error message, if any) on stdout
# Returns:   never returns; exits 0 without a message, 255 with one
#######################################
function print_usage() {
  echo "Usage: $(basename "$0") [options]"
  echo "Options:"
  echo " -h, --help show this help message and exit"
  echo " -i FILE input file"
  echo " --rsc-dir=DIR resource dir <default: ${base_dir}/rsc>"
  echo " --output=FILE output file <default: result.json>"
  if [[ -z "$1" ]]; then
    exit 0
  fi
  echo
  # printf with a quoted argument prints the message verbatim: no word
  # splitting, no glob expansion, no echo option parsing.
  printf '%s\n' "$1"
  # 255 is the status `exit -1` yields; spelled out because a negative
  # exit argument is not portable.
  exit 255
}
# Run the command line through the bundled getopt helper (short options
# "hi:", long options help / rsc-dir= / output=) and reload the positional
# parameters from what it prints.
# NOTE(review): the helper presumably emits a shell-quoted argument list
# suitable for `eval set --` - confirm against lib/getopts.py.
# "$@" is quoted so arguments containing whitespace or glob characters
# reach the helper as single, unmodified words.
if OPTS=$(python "${base_dir}/lib/getopts.py" "hi:" "help,rsc-dir=,output=" "$@"); then
  eval set -- "$OPTS"
else
  print_usage "option parse error"
  # Safety net only: print_usage exits by itself when given a message.
  exit 255
fi
# Walk the argument list and record each recognized option in its
# variable. An option that takes a value shifts once for the value here;
# the shift at the bottom of the loop then drops the option itself.
while (( $# >= 1 )); do
  case "$1" in
    --)
      # end-of-options marker: stop scanning
      break
      ;;
    -h | --help)
      print_usage
      ;;
    -i)
      input="$2"
      shift
      ;;
    --rsc-dir)
      rsc_dir="$2"
      shift
      ;;
    --output)
      output="$2"
      shift
      ;;
  esac
  shift
done
# resource dir: defaults to the rsc/ directory under base_dir
if [[ -z "${rsc_dir}" ]]; then
  rsc_dir="${base_dir}/rsc"
fi
# Quoted so a path containing whitespace is tested as one operand; unquoted,
# the test itself errors out and the missing-directory check is skipped.
if [[ ! -d "${rsc_dir}" ]]; then
  echo "${rsc_dir} not found"
  exit 1
fi
# input: mandatory
if [[ -z "${input}" ]]; then
  print_usage "-i option is required"
  # Safety net only: print_usage exits by itself when given a message.
  exit 2
fi
# output: defaults to result.json (relative to the current directory)
if [[ -z "${output}" ]]; then
  output=result.json
fi
############
# settings #
############
# Korean UTF-8 locale and UTF-8 stdio encoding for the Python tools;
# PYTHONPATH points at the lib/ directory under base_dir.
export LANG=ko_KR.UTF-8
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${base_dir}/lib"
# CRF model: use the unpacked file if present, otherwise unpack the .gz
# that sits next to it. Paths are quoted so a resource dir containing
# whitespace works in both the tests and the redirection.
crf_model="${rsc_dir}/crf.model"
if [[ ! -f "${crf_model}" ]]; then
  if [[ -f "${crf_model}.gz" ]]; then
    gzip -c -d "${crf_model}.gz" > "${crf_model}"
  else
    echo "CRF model file not found"
    exit 3
  fi
fi
# word2vec dictionary: use the unpacked file if present, otherwise unpack
# the .gz that sits next to it.
w2v_dic="${rsc_dir}/word2vec.pkl"
# The test must expand the variable: a bare "{w2v_dic}" is a literal file
# name that never exists, which forces a decompression on every run and
# exits 4 when only the unpacked .pkl is available.
if [[ ! -f "${w2v_dic}" ]]; then
  if [[ -f "${w2v_dic}.gz" ]]; then
    gzip -c -d "${w2v_dic}.gz" > "${w2v_dic}"
  else
    echo "word2vec file not found"
    exit 4
  fi
fi
#######
# run #
#######
bin_dir="${base_dir}/bin"
# 1) Convert the JSON corpus into a CRF feature file
echo "1) convert JSON input to CRF feature" >&2
feat_file="${output}.crffeat"
# Every path is quoted so file names with whitespace stay single arguments.
if ! python "${bin_dir}/json2feat.py" -g "${rsc_dir}/gazette.annie" \
    --input="${input}" \
    --output="${feat_file}"; then
  echo "fail to extract features from ${input}"
  exit 5
fi
# 2) Tag using crfsuite
echo "2) tag with CRF model" >&2
# The binary name carries the `uname` output as its suffix.
crfsuite_bin="${bin_dir}/crfsuite.$(uname)"
tag_file="${output}.crftag"
# Quoted command, arguments and redirect target: an unquoted target with
# whitespace is an "ambiguous redirect" error in bash.
if "${crfsuite_bin}" tag -m "${crf_model}" "${feat_file}" > "${tag_file}"; then
  # the intermediate feature file is only removed after a successful run
  rm -f -- "${feat_file}"
else
  echo "fail to tag with CRF model"
  exit 6
fi
# 3) Convert the tagged file to JSON format
echo "3) convert IOB2 to JSON" >&2
crf_predict="${output}.crf"
# Every path is quoted so file names with whitespace stay single arguments.
if python "${bin_dir}/iob2json.py" -j "${input}" \
    --input="${tag_file}" \
    --output="${crf_predict}"; then
  # the intermediate tag file is only removed after a successful conversion
  rm -f -- "${tag_file}"
else
  echo "fail to make JSON with tagged file"
  exit 7
fi
# 4) Additionally tag person names on the CRF result using the SVM model
echo "4) tag PS NEs" >&2
# Every path is quoted so file names with whitespace stay single arguments.
if python "${bin_dir}/tag_ps.py" -w "${w2v_dic}" \
    -m "${rsc_dir}/nusvc_model.pkl" \
    --input="${crf_predict}" \
    --output="${output}"; then
  # the intermediate CRF prediction is only removed after a successful run
  rm -f -- "${crf_predict}"
else
  # Names this stage rather than repeating the IOB2-to-JSON stage's
  # message, so the failing step is identifiable from the output.
  echo "fail to tag PS NEs"
  exit 8
fi