diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index 07dd29a33ad68..85b61e2c9867c 100644 --- a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -441,7 +441,6 @@ - diff --git a/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java index b7481e78496e5..1054721535eac 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -67,8 +67,10 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; +import java.nio.charset.CharacterCodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -163,7 +165,8 @@ public static CharArraySet parseStemExclusion(Settings settings, CharArraySet de NAMED_STOP_WORDS = unmodifiableMap(namedStopWords); } - public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, Map<String, Set<?>> namedWords, boolean ignoreCase) { + public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, + Map<String, Set<?>> namedWords, boolean ignoreCase) { String value = settings.get(name); if (value != null) { if ("_none_".equals(value)) { @@ -237,12 +240,17 @@ public static List<String> getWordList(Environment env, Settings settings, Strin } } - final Path wordListFile = env.configFile().resolve(wordListPath); + final Path path = env.configFile().resolve(wordListPath); - try (BufferedReader reader = FileSystemUtils.newBufferedReader(wordListFile.toUri().toURL(), StandardCharsets.UTF_8)) { + try (BufferedReader reader = FileSystemUtils.newBufferedReader(path.toUri().toURL(), StandardCharsets.UTF_8)) { 
return loadWordList(reader, "#"); + } catch (CharacterCodingException ex) { + String message = String.format(Locale.ROOT, + "Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded", + settingPrefix, path.toString()); + throw new IllegalArgumentException(message, ex); } catch (IOException ioe) { - String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix); + String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString()); throw new IllegalArgumentException(message, ioe); } } @@ -256,7 +264,7 @@ public static List<String> loadWordList(Reader reader, String comment) throws IO } else { br = new BufferedReader(reader); } - String word = null; + String word; while ((word = br.readLine()) != null) { if (!Strings.hasText(word)) { continue; @@ -283,13 +291,16 @@ public static Reader getReaderFromFile(Environment env, Settings settings, Strin if (filePath == null) { return null; } - final Path path = env.configFile().resolve(filePath); - try { return FileSystemUtils.newBufferedReader(path.toUri().toURL(), StandardCharsets.UTF_8); + } catch (CharacterCodingException ex) { + String message = String.format(Locale.ROOT, + "Unsupported character encoding detected while reading %s_path: %s files must be UTF-8 encoded", + settingPrefix, path.toString()); + throw new IllegalArgumentException(message, ex); } catch (IOException ioe) { - String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix); + String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString()); throw new IllegalArgumentException(message, ioe); } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java index 061e0d9d29fc7..37943773cef2b 100644 --- 
a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java @@ -21,8 +21,23 @@ import org.apache.lucene.analysis.util.CharArraySet; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.test.ESTestCase; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.MalformedInputException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; + import static org.elasticsearch.common.settings.Settings.settingsBuilder; import static org.hamcrest.Matchers.is; @@ -42,4 +57,55 @@ public void testParseStemExclusion() { assertThat(set.contains("bar"), is(true)); assertThat(set.contains("baz"), is(false)); } + + public void testParseNonExistingFile() { + Path tempDir = createTempDir(); + Settings nodeSettings = Settings.builder() + .put("foo.bar_path", tempDir.resolve("foo.dict")) + .put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + Environment env = new Environment(nodeSettings); + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, + () -> Analysis.getWordList(env, nodeSettings, "foo.bar")); + assertEquals("IOException while reading foo.bar_path: " + tempDir.resolve("foo.dict").toString(), ex.getMessage()); + assertTrue(ex.getCause().toString(), ex.getCause() instanceof FileNotFoundException + || ex.getCause() instanceof NoSuchFileException); + } + + + public void testParseFalseEncodedFile() throws IOException { + Path tempDir = createTempDir(); + Path dict = tempDir.resolve("foo.dict"); + Settings nodeSettings = Settings.builder() + .put("foo.bar_path", dict) + 
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + try (OutputStream writer = Files.newOutputStream(dict)) { + writer.write(new byte[]{(byte) 0xff, 0x00, 0x00}); // some invalid UTF-8 + writer.write('\n'); + } + Environment env = new Environment(nodeSettings); + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, + () -> Analysis.getWordList(env, nodeSettings, "foo.bar")); + assertEquals("Unsupported character encoding detected while reading foo.bar_path: " + tempDir.resolve("foo.dict").toString() + + " - files must be UTF-8 encoded" , ex.getMessage()); + assertTrue(ex.getCause().toString(), ex.getCause() instanceof MalformedInputException + || ex.getCause() instanceof CharacterCodingException); + } + + public void testParseWordList() throws IOException { + Path tempDir = createTempDir(); + Path dict = tempDir.resolve("foo.dict"); + Settings nodeSettings = Settings.builder() + .put("foo.bar_path", dict) + .put(Environment.PATH_HOME_SETTING.getKey(), tempDir).build(); + try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) { + writer.write("hello"); + writer.write('\n'); + writer.write("world"); + writer.write('\n'); + } + Environment env = new Environment(nodeSettings); + List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.bar"); + assertEquals(Arrays.asList("hello", "world"), wordList); + + } }