forked from pemistahl/lingua-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_test.go
166 lines (142 loc) · 6.12 KB
/
example_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/*
* Copyright © 2021 Peter M. Stahl [email protected]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lingua_test
import (
"fmt"
"github.com/pemistahl/lingua-go"
)
// The simplest use case: build a detector restricted to a few languages and
// print the language it considers most likely for a short input text.
func Example_basic() {
	detector := lingua.NewLanguageDetectorBuilder().
		FromLanguages(
			lingua.English,
			lingua.French,
			lingua.German,
			lingua.Spanish,
		).
		Build()

	// The second return value reports whether a language could be determined;
	// nothing is printed when it is false.
	detected, ok := detector.DetectLanguageOf("languages are awesome")
	if !ok {
		return
	}
	fmt.Println(detected)

	// Output: English
}
// By default, Lingua returns the most likely language for a given input text.
// However, there are certain words that are spelled the same in more than one
// language. The word `prologue`, for instance, is both a valid English and
// French word. Lingua would output either English or French which might be
// wrong in the given context. For cases like that, it is possible to specify a
// minimum relative distance that the logarithmized and summed up probabilities
// for each possible language have to satisfy. It can be stated as seen below.
//
// Be aware that the distance between the language probabilities is dependent on
// the length of the input text. The longer the input text, the larger the
// distance between the languages. So if you want to classify very short text
// phrases, do not set the minimum relative distance too high. Otherwise Unknown
// will be returned most of the time as in the example below. This is the return
// value for cases where language detection is not reliably possible.
func Example_minimumRelativeDistance() {
	candidates := []lingua.Language{
		lingua.English,
		lingua.French,
		lingua.German,
		lingua.Spanish,
	}

	// Restrict the detector to the candidate languages first, then require a
	// minimum relative distance of 0.25 between the best and the runner-up.
	builder := lingua.NewLanguageDetectorBuilder().FromLanguages(candidates...)
	detector := builder.WithMinimumRelativeDistance(0.25).Build()

	// For such a short phrase the threshold is not met, so the detector
	// reports Unknown together with a false flag.
	result, found := detector.DetectLanguageOf("languages are awesome")

	fmt.Println(result)
	fmt.Println(found)

	// Output:
	// Unknown
	// false
}
// Knowing about the most likely language is nice but how reliable is the
// computed likelihood? And how much less likely are the other examined languages
// in comparison to the most likely one? In the example below, a slice of
// ConfidenceValue is returned containing all possible languages sorted by their
// confidence value in descending order. The values that this method computes are
// part of a relative confidence metric, not of an absolute one. Each value is a
// number between 0.0 and 1.0. The most likely language is always returned with
// value 1.0. All other languages get values assigned which are lower than 1.0,
// denoting how much less likely those languages are in comparison to the most
// likely language.
//
// The slice returned by this method does not necessarily contain all
// languages which the calling instance of LanguageDetector was built from.
// If the rule-based engine decides that a specific language is truly
// impossible, then it will not be part of the returned slice. Likewise,
// if no ngram probabilities can be found within the detector's languages
// for the given input text, the returned slice will be empty.
// The confidence value for each language not being part of the returned
// slice is assumed to be 0.0.
func Example_confidenceValues() {
	detector := lingua.NewLanguageDetectorBuilder().
		FromLanguages(
			lingua.English,
			lingua.French,
			lingua.German,
			lingua.Spanish,
		).
		Build()

	// The result is ordered by descending confidence, so printing the entries
	// in slice order lists the most likely language first.
	ranked := detector.ComputeLanguageConfidenceValues("languages are awesome")
	for i := range ranked {
		entry := ranked[i]
		fmt.Printf("%s: %.2f\n", entry.Language(), entry.Value())
	}

	// Output:
	// English: 1.00
	// French: 0.79
	// German: 0.75
	// Spanish: 0.72
}
// By default, Lingua uses lazy-loading to load only those language models on
// demand which are considered relevant by the rule-based filter engine. For web
// services, for instance, it is rather beneficial to preload all language models
// into memory to avoid unexpected latency while waiting for the service response.
// If you want to enable the eager-loading mode, you can do it as seen below.
// Multiple instances of LanguageDetector share the same language models in
// memory which are accessed asynchronously by the instances.
func Example_eagerLoading() {
	// Configure a builder for every supported language and switch it to
	// eager-loading mode before building.
	preloading := lingua.NewLanguageDetectorBuilder().
		FromAllLanguages().
		WithPreloadedLanguageModels()

	// The detector itself is not needed here; the example only demonstrates
	// how the eager-loading mode is enabled.
	_ = preloading.Build()
}
// There might be classification tasks where you know beforehand that your language
// data is definitely not written in Latin, for instance. The detection accuracy
// can become better in such cases if you exclude certain languages from the
// decision process or just explicitly include relevant languages.
func Example_builderApi() {
	// Every statement below starts from a fresh builder and shows one way of
	// selecting the languages to detect; the results are discarded on purpose.

	// All languages the library supports. This consumes at least 2GB of
	// memory and might lead to slow runtime performance.
	_ = lingua.NewLanguageDetectorBuilder().FromAllLanguages()

	// Only languages that are not yet extinct (currently this excludes Latin).
	_ = lingua.NewLanguageDetectorBuilder().FromAllSpokenLanguages()

	// Only languages that are written with Cyrillic script.
	_ = lingua.NewLanguageDetectorBuilder().FromAllLanguagesWithCyrillicScript()

	// Every language except Spanish.
	_ = lingua.NewLanguageDetectorBuilder().FromAllLanguagesWithout(lingua.Spanish)

	// English and German only.
	_ = lingua.NewLanguageDetectorBuilder().FromLanguages(lingua.English, lingua.German)

	// The same selection expressed with ISO 639-1 codes.
	_ = lingua.NewLanguageDetectorBuilder().FromIsoCodes639_1(lingua.EN, lingua.DE)

	// The same selection expressed with ISO 639-3 codes.
	_ = lingua.NewLanguageDetectorBuilder().FromIsoCodes639_3(lingua.ENG, lingua.DEU)
}