-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Tamil analyzer based on snowball stemmer and TamilNLP stopwords
- Loading branch information
Showing
5 changed files
with
347 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
133 changes: 133 additions & 0 deletions
133
lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/TamilAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ta; | ||
|
||
import java.io.IOException; | ||
import java.io.Reader; | ||
import java.io.UncheckedIOException; | ||
import org.apache.lucene.analysis.CharArraySet; | ||
import org.apache.lucene.analysis.LowerCaseFilter; | ||
import org.apache.lucene.analysis.StopFilter; | ||
import org.apache.lucene.analysis.StopwordAnalyzerBase; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.core.DecimalDigitFilter; | ||
import org.apache.lucene.analysis.in.IndicNormalizationFilter; | ||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; | ||
import org.apache.lucene.analysis.snowball.SnowballFilter; | ||
import org.apache.lucene.analysis.standard.StandardTokenizer; | ||
import org.tartarus.snowball.ext.TamilStemmer; | ||
|
||
/** | ||
* Analyzer for Tamil. | ||
* | ||
* @since 9.0 | ||
*/ | ||
public final class TamilAnalyzer extends StopwordAnalyzerBase { | ||
private final CharArraySet stemExclusionSet; | ||
|
||
/** | ||
* File containing default Tamil stopwords. | ||
* | ||
* <p>Default stopword list is from https://github.com/AshokR/TamilNLP (Apache 2 License) | ||
*/ | ||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; | ||
|
||
private static final String STOPWORDS_COMMENT = "#"; | ||
|
||
/** | ||
* Returns an unmodifiable instance of the default stop-words set. | ||
* | ||
* @return an unmodifiable instance of the default stop-words set. | ||
*/ | ||
public static CharArraySet getDefaultStopSet() { | ||
return DefaultSetHolder.DEFAULT_STOP_SET; | ||
} | ||
|
||
/** | ||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the | ||
* static final set the first time.; | ||
*/ | ||
private static class DefaultSetHolder { | ||
static final CharArraySet DEFAULT_STOP_SET; | ||
|
||
static { | ||
try { | ||
DEFAULT_STOP_SET = | ||
loadStopwordSet(false, TamilAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); | ||
} catch (IOException ex) { | ||
// default set should always be present as it is part of the | ||
// distribution (JAR) | ||
throw new UncheckedIOException("Unable to load default stopword set", ex); | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Builds an analyzer with the given stop words | ||
* | ||
* @param stopwords a stopword set | ||
* @param stemExclusionSet a stemming exclusion set | ||
*/ | ||
public TamilAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { | ||
super(stopwords); | ||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); | ||
} | ||
|
||
/** | ||
* Builds an analyzer with the given stop words | ||
* | ||
* @param stopwords a stopword set | ||
*/ | ||
public TamilAnalyzer(CharArraySet stopwords) { | ||
this(stopwords, CharArraySet.EMPTY_SET); | ||
} | ||
|
||
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */ | ||
public TamilAnalyzer() { | ||
this(DefaultSetHolder.DEFAULT_STOP_SET); | ||
} | ||
|
||
/** | ||
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all | ||
* the text in the provided {@link Reader}. | ||
* | ||
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link | ||
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter}, | ||
* {@link IndicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is | ||
* provided, {@link SnowballFilter}, and Tamil Stop words | ||
*/ | ||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
final Tokenizer source = new StandardTokenizer(); | ||
TokenStream result = new LowerCaseFilter(source); | ||
result = new DecimalDigitFilter(result); | ||
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); | ||
result = new IndicNormalizationFilter(result); | ||
result = new StopFilter(result, stopwords); | ||
result = new SnowballFilter(result, new TamilStemmer()); | ||
return new TokenStreamComponents(source, result); | ||
} | ||
|
||
@Override | ||
protected TokenStream normalize(String fieldName, TokenStream in) { | ||
TokenStream result = new LowerCaseFilter(in); | ||
result = new DecimalDigitFilter(result); | ||
result = new IndicNormalizationFilter(result); | ||
return result; | ||
} | ||
} |
19 changes: 19 additions & 0 deletions
19
lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/package-info.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/** Analyzer for Tamil. */ | ||
package org.apache.lucene.analysis.ta; |
126 changes: 126 additions & 0 deletions
126
lucene/analysis/common/src/resources/org/apache/lucene/analysis/ta/stopwords.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
# tamil stopwords from https://github.com/AshokR/TamilNLP | ||
ஒரு | ||
என்று | ||
மற்றும் | ||
இந்த | ||
இது | ||
என்ற | ||
கொண்டு | ||
என்பது | ||
பல | ||
ஆகும் | ||
அல்லது | ||
அவர் | ||
நான் | ||
உள்ள | ||
அந்த | ||
இவர் | ||
என | ||
முதல் | ||
என்ன | ||
இருந்து | ||
சில | ||
என் | ||
போன்ற | ||
வேண்டும் | ||
வந்து | ||
இதன் | ||
அது | ||
அவன் | ||
தான் | ||
பலரும் | ||
என்னும் | ||
மேலும் | ||
பின்னர் | ||
கொண்ட | ||
இருக்கும் | ||
தனது | ||
உள்ளது | ||
போது | ||
என்றும் | ||
அதன் | ||
தன் | ||
பிறகு | ||
அவர்கள் | ||
வரை | ||
அவள் | ||
நீ | ||
ஆகிய | ||
இருந்தது | ||
உள்ளன | ||
வந்த | ||
இருந்த | ||
மிகவும் | ||
இங்கு | ||
மீது | ||
ஓர் | ||
இவை | ||
இந்தக் | ||
பற்றி | ||
வரும் | ||
வேறு | ||
இரு | ||
இதில் | ||
போல் | ||
இப்போது | ||
அவரது | ||
மட்டும் | ||
இந்தப் | ||
எனும் | ||
மேல் | ||
பின் | ||
சேர்ந்த | ||
ஆகியோர் | ||
எனக்கு | ||
இன்னும் | ||
அந்தப் | ||
அன்று | ||
ஒரே | ||
மிக | ||
அங்கு | ||
பல்வேறு | ||
விட்டு | ||
பெரும் | ||
அதை | ||
பற்றிய | ||
உன் | ||
அதிக | ||
அந்தக் | ||
பேர் | ||
இதனால் | ||
அவை | ||
அதே | ||
ஏன் | ||
முறை | ||
யார் | ||
என்பதை | ||
எல்லாம் | ||
மட்டுமே | ||
இங்கே | ||
அங்கே | ||
இடம் | ||
இடத்தில் | ||
அதில் | ||
நாம் | ||
அதற்கு | ||
எனவே | ||
பிற | ||
சிறு | ||
மற்ற | ||
விட | ||
எந்த | ||
எனவும் | ||
எனப்படும் | ||
எனினும் | ||
அடுத்த | ||
இதனை | ||
இதை | ||
கொள்ள | ||
இந்தத் | ||
இதற்கு | ||
அதனால் | ||
தவிர | ||
போல | ||
வரையில் | ||
சற்று | ||
எனக் |
67 changes: 67 additions & 0 deletions
67
lucene/analysis/common/src/test/org/apache/lucene/analysis/ta/TestTamilAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ta; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.BaseTokenStreamTestCase; | ||
import org.apache.lucene.analysis.CharArraySet; | ||
|
||
/** Tests the TamilAnalyzer */ | ||
public class TestTamilAnalyzer extends BaseTokenStreamTestCase { | ||
/** This test fails with NPE when the stopwords file is missing in classpath */ | ||
public void testResourcesAvailable() { | ||
new TamilAnalyzer().close(); | ||
} | ||
|
||
/** test that snowball stemmer is hooked in */ | ||
public void testStemming() throws Exception { | ||
Analyzer a = new TamilAnalyzer(); | ||
// friend | ||
checkOneTerm(a, "நண்பன்", "நண்"); | ||
// friends | ||
checkOneTerm(a, "நண்பர்கள்", "நண்"); | ||
a.close(); | ||
} | ||
|
||
public void testExclusionSet() throws Exception { | ||
CharArraySet exclusionSet = new CharArraySet(asSet("நண்பர்கள்"), false); | ||
Analyzer a = new TamilAnalyzer(TamilAnalyzer.getDefaultStopSet(), exclusionSet); | ||
checkOneTerm(a, "நண்பர்கள்", "நண்பர்கள்"); | ||
a.close(); | ||
} | ||
|
||
/** test we fold digits to latin-1 */ | ||
public void testDigits() throws Exception { | ||
TamilAnalyzer a = new TamilAnalyzer(); | ||
checkOneTerm(a, "௧௨௩௪", "1234"); | ||
a.close(); | ||
} | ||
|
||
/** tamil doesn't have case, but test we case-fold any latin-1 etc */ | ||
public void testLowerCase() throws Exception { | ||
TamilAnalyzer a = new TamilAnalyzer(); | ||
checkOneTerm(a, "FIFA", "fifa"); | ||
a.close(); | ||
} | ||
|
||
/** blast some random strings through the analyzer */ | ||
public void testRandomStrings() throws Exception { | ||
Analyzer analyzer = new TamilAnalyzer(); | ||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); | ||
analyzer.close(); | ||
} | ||
} |