LUCENE-10096: Tamil Analyzer (#292)

Add Tamil analyzer based on snowball stemmer and TamilNLP stopwords
apache · Sep 11, 2021 · 24aa45d · 24aa45d
1 parent 8bce765
commit 24aa45d
Show file tree

Hide file tree

Showing 5 changed files with 347 additions and 0 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -19,6 +19,8 @@ New Features
 
 * LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
 
+* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)
+
 System Requirements
 
 * LUCENE-8738: Move to Java 11 as minimum Java version.

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/TamilAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/TamilAnalyzer.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ta;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UncheckedIOException;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.tartarus.snowball.ext.TamilStemmer;
+
+/**
+ * Analyzer for Tamil, based on the Snowball {@link TamilStemmer} and a default stop-word list.
+ *
+ * @since 9.0
+ */
+public final class TamilAnalyzer extends StopwordAnalyzerBase {
+  private final CharArraySet stemExclusionSet;
+
+  /**
+   * File containing default Tamil stopwords.
+   *
+   * <p>Default stopword list is from https://github.com/AshokR/TamilNLP (Apache 2 License)
+   */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  private static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   *
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static CharArraySet getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+   * static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET =
+            loadStopwordSet(false, TamilAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new UncheckedIOException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and stem exclusion set.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet terms that should not be stemmed; the set is copied
+   */
+  public TamilAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   *
+   * @param stopwords a stopword set
+   */
+  public TamilAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
+  public TamilAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
+   * the text in the provided {@link Reader}.
+   *
+   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
+   *     StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
+   *     {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, {@link
+   *     IndicNormalizationFilter}, {@link StopFilter}, and {@link SnowballFilter}
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new DecimalDigitFilter(result);
+    if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    result = new IndicNormalizationFilter(result);
+    result = new StopFilter(result, stopwords);
+    result = new SnowballFilter(result, new TamilStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new LowerCaseFilter(in);
+    result = new DecimalDigitFilter(result);
+    result = new IndicNormalizationFilter(result);
+    return result;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ta/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Analyzer for Tamil. */
+package org.apache.lucene.analysis.ta;
diff --git a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ta/stopwords.txt b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ta/stopwords.txt
@@ -0,0 +1,126 @@
+# Tamil stopwords from https://github.com/AshokR/TamilNLP
+ஒரு
+என்று
+மற்றும்
+இந்த
+இது
+என்ற
+கொண்டு
+என்பது
+பல
+ஆகும்
+அல்லது
+அவர்
+நான்
+உள்ள
+அந்த
+இவர்
+என
+முதல்
+என்ன
+இருந்து
+சில
+என்
+போன்ற
+வேண்டும்
+வந்து
+இதன்
+அது
+அவன்
+தான்
+பலரும்
+என்னும்
+மேலும்
+பின்னர்
+கொண்ட
+இருக்கும்
+தனது
+உள்ளது
+போது
+என்றும்
+அதன்
+தன்
+பிறகு
+அவர்கள்
+வரை
+அவள்
+நீ
+ஆகிய
+இருந்தது
+உள்ளன
+வந்த
+இருந்த
+மிகவும்
+இங்கு
+மீது
+ஓர்
+இவை
+இந்தக்
+பற்றி
+வரும்
+வேறு
+இரு
+இதில்
+போல்
+இப்போது
+அவரது
+மட்டும்
+இந்தப்
+எனும்
+மேல்
+பின்
+சேர்ந்த
+ஆகியோர்
+எனக்கு
+இன்னும்
+அந்தப்
+அன்று
+ஒரே
+மிக
+அங்கு
+பல்வேறு
+விட்டு
+பெரும்
+அதை
+பற்றிய
+உன்
+அதிக
+அந்தக்
+பேர்
+இதனால்
+அவை
+அதே
+ஏன்
+முறை
+யார்
+என்பதை
+எல்லாம்
+மட்டுமே
+இங்கே
+அங்கே
+இடம்
+இடத்தில்
+அதில்
+நாம்
+அதற்கு
+எனவே
+பிற
+சிறு
+மற்ற
+விட
+எந்த
+எனவும்
+எனப்படும்
+எனினும்
+அடுத்த
+இதனை
+இதை
+கொள்ள
+இந்தத்
+இதற்கு
+அதனால்
+தவிர
+போல
+வரையில்
+சற்று
+எனக்
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ta/TestTamilAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ta/TestTamilAnalyzer.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ta;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+
+/** Tests the {@link TamilAnalyzer}. */
+public class TestTamilAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with NPE when the stopwords file is missing in classpath */
+  public void testResourcesAvailable() {
+    new TamilAnalyzer().close();
+  }
+
+  /** Tests that the snowball stemmer is hooked in. */
+  public void testStemming() throws Exception {
+    // try-with-resources closes the analyzer even when an assertion fails
+    try (Analyzer a = new TamilAnalyzer()) {
+      // "friend" and "friends" must both reduce to the same stem
+      checkOneTerm(a, "நண்பன்", "நண்");
+      checkOneTerm(a, "நண்பர்கள்", "நண்");
+    }
+  }
+
+  /** Tests that a term in the stem exclusion set is emitted unstemmed. */
+  public void testExclusionSet() throws Exception {
+    CharArraySet exclusionSet = new CharArraySet(asSet("நண்பர்கள்"), false);
+    try (Analyzer a = new TamilAnalyzer(TamilAnalyzer.getDefaultStopSet(), exclusionSet)) {
+      checkOneTerm(a, "நண்பர்கள்", "நண்பர்கள்");
+    }
+  }
+
+  /** Tests that Tamil digits are folded to ASCII digits. */
+  public void testDigits() throws Exception {
+    try (TamilAnalyzer a = new TamilAnalyzer()) {
+      checkOneTerm(a, "௧௨௩௪", "1234");
+    }
+  }
+
+  /** Tamil doesn't have case, but test that any Latin text is still case-folded. */
+  public void testLowerCase() throws Exception {
+    try (TamilAnalyzer a = new TamilAnalyzer()) {
+      checkOneTerm(a, "FIFA", "fifa");
+    }
+  }
+
+  /** Blasts some random strings through the analyzer. */
+  public void testRandomStrings() throws Exception {
+    try (Analyzer analyzer = new TamilAnalyzer()) {
+      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+    }
+  }
+}