-
-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
new dictionary format that supports syllabaries
- Loading branch information
Showing
65 changed files
with
756 additions
and
342 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
download/* binary |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import java.nio.charset.StandardCharsets | ||
import java.util.zip.ZipEntry | ||
import java.util.zip.ZipOutputStream | ||
|
||
apply from: 'dictionary-tools.gradle' | ||
|
||
/**
 * Converts the dictionary of every language definition found under definitionsInputDir.
 *
 * Each definition file is parsed with parseLanguageDefintion() and the dictionary it refers to is
 * passed to convertDictionary(). The error messages produced by both steps are concatenated and,
 * if the result is not empty, the build is aborted with a GradleException containing all of them.
 *
 * @param definitionsInputDir     directory scanned (via fileTree) for language definition files
 * @param dictionariesInputDir    forwarded to parseLanguageDefintion()
 * @param dictionariesOutputDir   forwarded to convertDictionary() as the output directory
 * @param dictionariesDownloadDir not used by this closure; kept so existing callers keep working
 */
ext.convertDictionaries = { definitionsInputDir, dictionariesInputDir, dictionariesOutputDir, dictionariesDownloadDir ->
    // The closure below runs on the worker threads of a parallel stream, so it must not mutate
    // shared local state. The error counters returned by the helpers are deliberately not summed:
    // only the error messages decide whether the build fails.
    def errorStream = fileTree(dir: definitionsInputDir).getFiles().parallelStream().map { definition ->
        // langFileErrorCount is unpacked only to keep the tuple positions aligned
        def (_, sounds, __, locale, dictionaryFile, langFileErrorCount, langFileErrorMsg) = parseLanguageDefintion(definition, dictionariesInputDir)
        if (!langFileErrorMsg.isEmpty()) {
            return langFileErrorMsg
        }

        // conversionErrorCount is unpacked only to keep the tuple positions aligned
        def (conversionErrorCount, conversionErrorMessages) = convertDictionary(dictionaryFile, dictionariesOutputDir, DICTIONARY_OUTPUT_EXTENSION, sounds, locale, MAX_ERRORS, CSV_DELIMITER)
        if (!conversionErrorMessages.isEmpty()) {
            return conversionErrorMessages
        }

        return ""
    }

    String errorsMsg = errorStream.reduce("", String::concat)
    if (errorsMsg) {
        throw new GradleException(errorsMsg)
    }
}
|
||
|
||
// this cannot be static, because DictionaryTools will not be visible
/**
 * Reads a dictionary file line by line, groups its words by their digit sequence and writes the
 * result with writeZipAsset().
 *
 * Reading stops as soon as maxErrors lines have failed. Once any line has failed, no further words
 * are collected, but the remaining lines are still checked (up to maxErrors failures) so that
 * several problems can be reported at once.
 *
 * @return a two-element list: [number of errors, all error messages concatenated]
 */
def convertDictionary(File dictionaryFile, String dictionariesOutputDir, String outputDictionaryExtension, HashMap<String, String> sounds, Locale locale, int maxErrors, String csvDelimiter) {
    int failures = 0
    StringBuilder problems = new StringBuilder()
    LinkedHashMap<String, ArrayList<String>> wordsBySequence = new LinkedHashMap<>()

    List<String> lines = dictionaryFile.readLines()
    int index = 0
    while (index < lines.size() && failures < maxErrors) {
        String currentLine = lines.get(index)
        index++
        int lineNumber = index // 1-based, for the error messages

        def (word, transcription) = DictionaryTools.getDictionaryLineData(currentLine, csvDelimiter)

        String digitSequence = ""
        try {
            // a transcription, when present, is what gets converted to digits
            boolean hasTranscription = !transcription.isEmpty()
            def spokenForm = hasTranscription ? transcription : word
            digitSequence = DictionaryTools.wordToDigitSequence(locale, spokenForm, sounds, hasTranscription)
        } catch (IllegalArgumentException e) {
            failures++
            problems.append("Dictionary '${dictionaryFile.name}' is invalid. Failed generating digit sequence for word '${word}' on line ${lineNumber}. ${e.message}\n")
        }

        if (failures > 0) {
            continue
        }

        wordsBySequence.computeIfAbsent(digitSequence) { new ArrayList<String>() }.add(word)
    }

    def assetError = writeZipAsset(dictionariesOutputDir, dictionaryFile, wordsBySequence, outputDictionaryExtension)
    if (assetError) {
        failures++
        problems.append(assetError)
    }

    return [failures, problems.toString()]
}
|
||
//////////////////// FILE I/O //////////////////// | ||
|
||
/**
 * Builds the bytes of one output line: the digit sequence immediately followed by all of its
 * words, with no separators, encoded as UTF-8.
 */
static byte[] compressDictionaryLine(String digitSequence, List<String> words) {
    StringBuilder packedLine = new StringBuilder().append(digitSequence)
    for (String word : words) {
        packedLine.append(word)
    }

    return packedLine.toString().getBytes(StandardCharsets.UTF_8)
}
|
||
/**
 * Writes the converted dictionary as a ZIP archive with a single "<name>.txt" entry.
 * Zipping the text files results in a smaller APK in comparison to the uncompressed text files.
 *
 * The output file is "<dictionary file name without extension>.<outputDictionaryExtension>" in
 * dictionariesOutputDir. Every map entry is written with compressDictionaryLine().
 *
 * @param dictionariesOutputDir     directory that receives the archive
 * @param dictionaryFile            source dictionary; only its name is used
 * @param outputDictionary          map of digit sequence -> list of words
 * @param outputDictionaryExtension extension of the archive file, without the leading dot
 * @return an empty string on success, or an error message when writing fails
 */
static def writeZipAsset(dictionariesOutputDir, dictionaryFile, outputDictionary, outputDictionaryExtension) {
    def fileName = dictionaryFile.name.replaceFirst("\\.\\w+\$", "")
    def outputFile = new File(dictionariesOutputDir, "${fileName}.${outputDictionaryExtension}")

    try {
        // withCloseable() closes the stream even when writing fails half-way, so the file handle
        // is never leaked.
        new ZipOutputStream(new FileOutputStream(outputFile)).withCloseable { zipOutputStream ->
            zipOutputStream.putNextEntry(new ZipEntry("${fileName}.txt"))
            outputDictionary.each { digitSequence, words ->
                zipOutputStream.write(compressDictionaryLine(digitSequence, words))
            }
            zipOutputStream.closeEntry()
        }
        return ""
    } catch (Exception e) {
        return "Failed writing to '${outputFile.path}'. ${e.message}\n"
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,70 @@ | ||
/**
 * Writes a "<dictionary name without extension>.props.yml" file into sizesDir for every file
 * found under dictionariesDir. Each properties file contains four lines: hash (SHA-1 digest of
 * the file text), revision (output of "git log" for the file), size (file length) and words
 * (number of "\n"-separated lines in the file text).
 *
 * For a file that does not exist, the hash and revision are empty and the size and word count
 * are 0.
 *
 * @param dictionariesDir directory scanned (via fileTree) for dictionary files
 * @param sizesDir        directory that receives the generated ".props.yml" files
 */
ext.getDictionaryProperties = { dictionariesDir, sizesDir ->
    fileTree(dir: dictionariesDir).getFiles().parallelStream().forEach { dictionary ->
        boolean isAvailable = dictionary.exists()
        // read the file once; it is needed for both the hash and the word count
        String contents = isAvailable ? dictionary.text : ""

        def hash = isAvailable ? contents.digest("SHA-1") : ""
        // exec() is defined outside this file; presumably it returns the command output - confirm
        def revision = isAvailable ? exec("git log --pretty=tformat:%H -n 1 ${dictionary}") : ""
        def size = isAvailable ? dictionary.length() : 0
        def words = isAvailable ? contents.split("\n").length : 0

        new File(sizesDir, "${dictionary.getName().replaceFirst("\\.\\w+\$", "")}.props.yml").text = "hash: ${hash}\nrevision: ${revision}\nsize: ${size}\nwords: ${words}"
    }
}
|
||
|
||
/**
 * Static helpers for reading dictionary source lines and converting words to digit sequences.
 * The class is wrapped like this so that it can be exported through "ext" to other build scripts.
 */
class Wrapper {
    /**
     * Splits a dictionary line into word, transcription and frequency.
     *
     * Supported layouts (columns separated by the delimiter):
     *   word
     *   word, frequency
     *   word, transcription
     *   word, transcription, frequency
     * A column counts as a transcription only when it consists of the letters a-z / A-Z.
     *
     * @param line      one line of the dictionary file
     * @param delimiter column separator; it is passed to String.split(), so it is treated as a
     *                  regular expression
     * @return [word, transcription, frequency], where transcription is "" when there is none and
     *         frequency is 0 when it is missing or -1 when it is not a valid integer
     */
    static def getDictionaryLineData(String line, String delimiter) {
        String[] parts = line.split(delimiter, 2)
        String word = parts[0]

        String transcription = ""
        // without a transcription, everything after the word is the frequency
        String frequencyText = parts.length > 1 ? parts[1] : "0"

        if (parts.length > 1) {
            // The text after the word is either "frequency", "transcription" or
            // "transcription<delimiter>frequency". Splitting it again is what makes the last
            // form work; a single split with limit 2 can never expose a third column.
            String[] rest = parts[1].split(delimiter, 2)
            if (rest[0] =~ "^[a-zA-Z]+\$") {
                transcription = rest[0]
                frequencyText = rest.length > 1 ? rest[1] : "0"
            }
        }

        int frequency
        try {
            frequency = frequencyText as int
        } catch (Exception ignored) {
            // not a number; -1 marks the frequency as invalid for the caller
            frequency = -1
        }

        return [word, transcription, frequency]
    }


    /**
     * Converts a word (or its transcription) into the digit sequence defined by the sounds map.
     *
     * A plain word is upper-cased using the locale and looked up character by character. A
     * transcribed word is used as is and split into sounds in front of every upper case
     * character, e.g. "KaNa" is looked up as "Ka" and "Na".
     *
     * @param locale        language of the word; used for upper-casing and for the Greek special case
     * @param word          the word, or its transcription when isTranscribed is true
     * @param sounds        map of sound (or letter) -> digit(s)
     * @param isTranscribed whether "word" is a transcription
     * @return the concatenated digits of all sounds
     * @throws IllegalArgumentException when a sound is not in the sounds map or when the word
     *                                  produces no digits at all
     */
    static def wordToDigitSequence(Locale locale, String word, HashMap<String, String> sounds, boolean isTranscribed) {
        StringBuilder sequence = new StringBuilder()

        final String normalizedWord = isTranscribed ? word : word.toUpperCase(locale)
        final boolean isGreek = locale.getLanguage() == "el"
        String currentSound = ""

        for (int i = 0, end = normalizedWord.length() - 1; i <= end; i++) {
            char currentChar = normalizedWord.charAt(i)
            char nextChar = i < end ? normalizedWord.charAt(i + 1) : 0
            int nextCharType = Character.getType(nextChar)

            currentSound += currentChar

            // charAt(i) returns "ΐ" as three separate characters, but they must be treated as one.
            if (
                isGreek
                && (nextCharType == Character.NON_SPACING_MARK || nextCharType == Character.ENCLOSING_MARK || nextCharType == Character.COMBINING_SPACING_MARK)
            ) {
                continue
            }

            // A sound is complete after every character of a plain word, at the end of the word,
            // or right before the upper case character that starts the next transcribed sound.
            if (!isTranscribed || i == end || Character.isUpperCase(nextChar)) {
                if (!sounds.containsKey(currentSound)) {
                    throw new IllegalArgumentException("Sound or layout entry '${currentSound}' does not belong to the language sound list: ${sounds}.")
                } else {
                    sequence.append(sounds.get(currentSound))
                    currentSound = ""
                }
            }
        }

        if (sequence.length() == 0) {
            throw new IllegalArgumentException("The word does not contain any valid sounds.")
        }

        return sequence.toString()
    }
}
|
||
ext.DictionaryTools = Wrapper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.