#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Chinese Segmenter
#
# Copyright (C) 2001-2014 NLTK Project
# Author: 52nlp <[email protected]>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals, print_function
import tempfile
import os
import json
from subprocess import PIPE
from nltk import compat
from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI


class StanfordSegmenter(TokenizerI):
    r"""
    Interface to the Stanford Chinese Segmenter.

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> segmenter = StanfordSegmenter(
    ...     path_to_jar="stanford-segmenter-3.4.1.jar",
    ...     path_to_sihan_corpora_dict="./data",
    ...     path_to_model="./data/pku.gz",
    ...     path_to_dict="./data/dict-chris6.ser.gz")
    >>> sentence = u"这是斯坦福中文分词器测试"
    >>> segmenter.segment(sentence)
    u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
    >>> segmenter.segment_file("test.simp.utf8")
    u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...
    """
    _JAR = 'stanford-segmenter.jar'

def __init__(self, path_to_jar=None,
path_to_sihan_corpora_dict=None,
path_to_model=None, path_to_dict=None,
encoding='UTF-8', options=None,
verbose=False, java_options='-mx2g'):
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
env_vars=('STANFORD_SEGMENTER',),
searchpath=(),
verbose=verbose
)
self._sihan_corpora_dict = path_to_sihan_corpora_dict
self._model = path_to_model
self._dict = path_to_dict
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
        self._options_cmd = ','.join(
            '{0}={1}'.format(key, json.dumps(val))
            for key, val in options.items()
        )
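        # For illustration (an assumed example, not from the original):
        # passing options={'keepAllWhitespaces': False} produces the flag
        # string 'keepAllWhitespaces=false', since json.dumps renders
        # Python booleans as lowercase JSON literals.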

    def segment_file(self, input_file_path):
        """
        Segment the text in ``input_file_path`` and return the segmented
        output as a single string.
        """
cmd = [
'edu.stanford.nlp.ie.crf.CRFClassifier',
'-sighanCorporaDict', self._sihan_corpora_dict,
'-textFile', input_file_path,
'-sighanPostProcessing', 'true',
'-keepAllWhitespaces', 'false',
'-loadClassifier', self._model,
'-serDictionary', self._dict
]
stdout = self._execute(cmd)
return stdout

    def segment(self, tokens):
        """
        Segment a single sentence.
        """
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """
        Segment multiple sentences in a single call to the Stanford
        segmenter and return its decoded standard output.
        """
encoding = self._encoding
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
        # Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
if isinstance(_input, compat.text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
'edu.stanford.nlp.ie.crf.CRFClassifier',
'-sighanCorporaDict', self._sihan_corpora_dict,
'-textFile', self._input_file_path,
'-sighanPostProcessing', 'true',
'-keepAllWhitespaces', 'false',
'-loadClassifier', self._model,
'-serDictionary', self._dict
]
stdout = self._execute(cmd)
# Delete the temporary file
os.unlink(self._input_file_path)
return stdout

    def _execute(self, cmd, verbose=False):
encoding = self._encoding
cmd.extend(['-inputEncoding', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(['-options', self._options_cmd])
default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
        stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                               stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout


def setup_module(module):
from nose import SkipTest
try:
StanfordSegmenter()
except LookupError:
        raise SkipTest(
            'doctests from nltk.tokenize.stanford_segmenter are skipped '
            "because the stanford segmenter jar doesn't exist"
        )
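

if __name__ == '__main__':
    # A minimal usage sketch, not part of the original module. The jar,
    # corpora directory, model, and dictionary paths below are assumptions
    # mirroring the doctest above; the Stanford Segmenter 3.4.1
    # distribution must be unpacked locally for this to run.
    segmenter = StanfordSegmenter(
        path_to_jar="stanford-segmenter-3.4.1.jar",
        path_to_sihan_corpora_dict="./data",
        path_to_model="./data/pku.gz",
        path_to_dict="./data/dict-chris6.ser.gz",
    )
    # Prints the space-separated segmentation of the sample sentence.
    print(segmenter.segment(u"这是斯坦福中文分词器测试"))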