Skip to content

Commit

Permalink
new dictionary format that supports syllabaries
Browse files Browse the repository at this point in the history
  • Loading branch information
sspanak committed Oct 31, 2024
1 parent 56b3556 commit a6e84c3
Show file tree
Hide file tree
Showing 65 changed files with 756 additions and 342 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
download/* binary
6 changes: 5 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@ jobs:
uses: gradle/gradle-build-action@v3

# validate and build
- name: Validate Help
run: ./gradlew convertHelp
- name: Validate Downloads
run: ./scripts/validate-downloads.sh app/languages/dictionaries download
- name: Validate Dictionaries
run: ./gradlew validateLanguages
- name: Build Languages
run: ./gradlew copyDefinitions copyDictionaries writeDictionaryProperties
run: ./gradlew copyDefinitions buildDictionaries writeDictionaryProperties
- name: Lint
run: ./gradlew lint # this actually runs mergeResources, so it must come after the dictionary tasks
- name: Build all APK variants
Expand Down
97 changes: 97 additions & 0 deletions app/build-dictionary.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import java.nio.charset.StandardCharsets
import java.util.zip.ZipEntry
import java.util.zip.ZipOutputStream

apply from: 'dictionary-tools.gradle'

/**
 * Converts the source dictionary of every language definition found in definitionsInputDir
 * into the output format and stores the results in dictionariesOutputDir.
 *
 * Each definition is parsed with parseLanguageDefintion() and, when parsing reports no error
 * message, its dictionary is converted with convertDictionary(). All error messages are
 * concatenated and, if there are any, reported by throwing a GradleException.
 *
 * @param definitionsInputDir     directory containing the language definition files
 * @param dictionariesInputDir    directory containing the source dictionaries
 * @param dictionariesOutputDir   directory where the converted dictionaries are written
 * @param dictionariesDownloadDir not used by the conversion itself; kept for the callers
 * @throws GradleException when any definition or dictionary could not be processed
 */
ext.convertDictionaries = { definitionsInputDir, dictionariesInputDir, dictionariesOutputDir, dictionariesDownloadDir ->
	// The closure below runs on the worker threads of a parallel stream, so it must not update
	// shared local counters. Errors are reported only through the returned messages.
	def errorStream = fileTree(dir: definitionsInputDir).getFiles().parallelStream().map { definition ->
		// the error counts are part of the returned tuples, but only the messages are needed here
		def (_, sounds, __, locale, dictionaryFile, langFileErrorCount, langFileErrorMsg) = parseLanguageDefintion(definition, dictionariesInputDir)
		if (!langFileErrorMsg.isEmpty()) {
			return langFileErrorMsg
		}

		def (conversionErrorCount, conversionErrorMessages) = convertDictionary(dictionaryFile, dictionariesOutputDir, DICTIONARY_OUTPUT_EXTENSION, sounds, locale, MAX_ERRORS, CSV_DELIMITER)
		if (!conversionErrorMessages.isEmpty()) {
			return conversionErrorMessages
		}

		return ""
	}

	String errorsMsg = errorStream.reduce("", String::concat)
	if (errorsMsg) {
		throw new GradleException(errorsMsg)
	}
}


// this cannot be static, because DictionaryTools will not be visible
/**
 * Groups the words of one source dictionary by their digit sequence and writes the result
 * as a zipped asset using writeZipAsset().
 *
 * Reading stops once maxErrors lines have failed. After the first failure, no more words are
 * collected, but the remaining lines are still checked, so that their errors are reported too.
 *
 * @return a two-element list: [number of errors, concatenated error messages]
 */
def convertDictionary(File dictionaryFile, String dictionariesOutputDir, String outputDictionaryExtension, HashMap<String, String> sounds, Locale locale, int maxErrors, String csvDelimiter) {
	int failures = 0
	String report = ''

	LinkedHashMap<String, ArrayList<String>> wordsBySequence = new LinkedHashMap<>()

	int lineNumber = 0
	Iterator<String> lines = dictionaryFile.readLines().iterator()
	while (lines.hasNext() && failures < maxErrors) {
		String line = lines.next()
		lineNumber++

		def (word, transcription) = DictionaryTools.getDictionaryLineData(line, csvDelimiter)
		boolean hasTranscription = !transcription.isEmpty()

		String digitSequence = ""
		try {
			// when available, the transcription is what gets converted to digits
			digitSequence = DictionaryTools.wordToDigitSequence(locale, hasTranscription ? transcription : word, sounds, hasTranscription)
		} catch (IllegalArgumentException e) {
			failures++
			report += "Dictionary '${dictionaryFile.name}' is invalid. Failed generating digit sequence for word '${word}' on line ${lineNumber}. ${e.message}\n"
		}

		if (failures != 0) {
			continue
		}

		wordsBySequence.computeIfAbsent(digitSequence) { new ArrayList<String>() }.add(word)
	}

	def assetError = writeZipAsset(dictionariesOutputDir, dictionaryFile, wordsBySequence, outputDictionaryExtension)
	if (assetError) {
		failures++
		report += assetError
	}

	return [failures, report]
}

//////////////////// FILE I/O ////////////////////

/**
 * Builds one line of the output dictionary: the digit sequence, immediately followed by all
 * of its words, with no separators in between, encoded as UTF-8.
 */
static byte[] compressDictionaryLine(String digitSequence, List<String> words) {
	String joinedWords = words.join('')
	String line = "${digitSequence}${joinedWords}"
	return line.getBytes(StandardCharsets.UTF_8)
}

/**
 * Writes the converted dictionary into a zip archive with a single text entry.
 * Zipping the text files results in a smaller APK in comparison to the uncompressed text files.
 *
 * The archive is named after the source dictionary, with its extension replaced by
 * outputDictionaryExtension. The entry inside has the same base name and a ".txt" extension.
 *
 * @param dictionariesOutputDir     directory where the archive is created
 * @param dictionaryFile            the source dictionary; only its name is used
 * @param outputDictionary          map of digit sequence -> list of words
 * @param outputDictionaryExtension extension of the archive, without the leading dot
 * @return an empty string on success, otherwise a message describing the failure
 */
static def writeZipAsset(dictionariesOutputDir, dictionaryFile, outputDictionary, outputDictionaryExtension) {
	def fileName = dictionaryFile.name.replaceFirst("\\.\\w+\$", "")
	def outputFile = new File(dictionariesOutputDir, "${fileName}.${outputDictionaryExtension}")

	try {
		// withCloseable() guarantees the stream is closed, even when writing an entry fails
		new ZipOutputStream(new FileOutputStream(outputFile)).withCloseable { zipOutputStream ->
			zipOutputStream.putNextEntry(new ZipEntry("${fileName}.txt"))
			outputDictionary.each { digitSequence, words ->
				zipOutputStream.write(compressDictionaryLine(digitSequence, words))
			}
			zipOutputStream.closeEntry()
		}
		return ""
	} catch (Exception e) {
		return "Failed writing to '${outputFile.path}'. ${e.message}\n"
	}
}
57 changes: 41 additions & 16 deletions app/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@ plugins {
}

apply from: 'constants.gradle'
apply from: 'dictionary-tools.gradle'
apply from: 'help-tools.gradle'
apply from: 'build-dictionary.gradle'
apply from: 'validate-languages.gradle'
apply from: 'help-tools.gradle'
apply from: 'version-tools.gradle'


tasks.register('copyDefinitions', Copy) {
from LANGUAGES_INPUT_DIR
include '**/*.yml'
into LANGUAGES_OUTPUT_DIR
}


tasks.register('validateLanguages') {
inputs.dir LANGUAGES_INPUT_DIR
outputs.dir LANGUAGE_VALIDATION_DIR
Expand All @@ -18,19 +25,30 @@ tasks.register('validateLanguages') {
}
}

tasks.register('copyDefinitions', Copy) {
from LANGUAGES_INPUT_DIR
include '**/*.yml'
into LANGUAGES_OUTPUT_DIR

tasks.register('buildDictionaries') {
inputs.dir DICTIONARIES_INPUT_DIR
outputs.dir DICTIONARIES_OUTPUT_DIR

dependsOn validateLanguages
mustRunAfter validateLanguages

doLast {
convertDictionaries(DEFINITIONS_INPUT_DIR, DICTIONARIES_INPUT_DIR, DICTIONARIES_OUTPUT_DIR, DICTIONARIES_DOWNLOAD_DIR)
}
}

tasks.register('copyDictionaries', Copy) {
from DICTIONARIES_INPUT_DIR
include '**/*.csv'
include '**/*.txt'
into DICTIONARIES_OUTPUT_DIR

tasks.register('generateDownloads', Copy) {
from DICTIONARIES_OUTPUT_DIR
include "**/*.$DICTIONARY_OUTPUT_EXTENSION"
into DICTIONARIES_DOWNLOAD_DIR

dependsOn buildDictionaries
mustRunAfter buildDictionaries
}


tasks.register('convertHelp') {
inputs.dir HELP_MARKDOWN_DIR
outputs.dir HELP_HTML_DIR
Expand Down Expand Up @@ -58,6 +76,7 @@ tasks.register('updateManifest') {
clean {
delete LANGUAGES_OUTPUT_DIR
delete DICTIONARIES_OUTPUT_DIR
delete DICTIONARIES_DOWNLOAD_DIR
delete HELP_HTML_DIR
}

Expand All @@ -84,10 +103,12 @@ android {
}
buildTypes {
debug {
buildConfigField 'String', 'DICTIONARY_EXTENSION', "\"${DICTIONARY_OUTPUT_EXTENSION}\""
buildConfigField 'String', 'VERSION_FULL', "\"${getVersionString('debug')}\""
}

release {
buildConfigField 'String', 'DICTIONARY_EXTENSION', "\"${DICTIONARY_OUTPUT_EXTENSION}\""
buildConfigField 'String', 'VERSION_FULL', "\"${getVersionString('release')}\""

debuggable false
Expand Down Expand Up @@ -124,15 +145,19 @@ android {
].each { taskName ->
try {
tasks.named(taskName)?.configure {
dependsOn(validateLanguages, copyDefinitions, copyDictionaries, writeDictionaryProperties, convertHelp)
dependsOn(copyDefinitions, writeDictionaryProperties, convertHelp, validateLanguages, buildDictionaries)
}

if (taskName.toLowerCase().contains("full")) {
tasks.named(taskName)?.configure {dependsOn(buildDictionaries) }
}
} catch (UnknownTaskException ignored) {}
}

assembleLiteDebug.finalizedBy(updateManifest)
assembleFullDebug.finalizedBy(updateManifest)
assembleLiteRelease.finalizedBy(updateManifest)
assembleFullRelease.finalizedBy(updateManifest)
assembleLiteDebug.finalizedBy(generateDownloads, updateManifest)
assembleFullDebug.finalizedBy(generateDownloads, updateManifest)
assembleLiteRelease.finalizedBy(generateDownloads, updateManifest)
assembleFullRelease.finalizedBy(generateDownloads, updateManifest)

variant.outputs.configureEach {
def suffix = variant.flavorName == 'full' ? '-full' : ''
Expand Down
12 changes: 8 additions & 4 deletions app/constants.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@ ext.PACKAGE_NAME = "io.github.sspanak.${APP_NAME}"
ext.LANGUAGES_DIR_NAME = 'languages'
ext.DEFINITIONS_DIR_NAME = 'definitions'
ext.DICTIONARIES_DIR_NAME = 'dictionaries'
ext.DICTIONARIES_DOWNLOAD_DIR_NAME = 'download'
ext.DICTIONARY_SIZES_DIR_NAME = 'dictionary-sizes'

def ROOT_DIR = "${project.rootDir}/app"
def MAIN_ASSETS_DIR = "${ROOT_DIR}/src/main/assets"
def FULL_VERSION_ASSETS_DIR = "${ROOT_DIR}/src/full/assets"
def APP_ROOT_DIR = "${project.rootDir}/app"
def MAIN_ASSETS_DIR = "${APP_ROOT_DIR}/src/main/assets"
def FULL_VERSION_ASSETS_DIR = "${APP_ROOT_DIR}/src/full/assets"

ext.HELP_MARKDOWN_DIR = "${project.rootDir}/docs/help"
ext.HELP_HTML_DIR = "${MAIN_ASSETS_DIR}/help"

ext.LANGUAGES_INPUT_DIR = "${ROOT_DIR}/${LANGUAGES_DIR_NAME}"
ext.LANGUAGES_INPUT_DIR = "${APP_ROOT_DIR}/${LANGUAGES_DIR_NAME}"
ext.DEFINITIONS_INPUT_DIR = "${LANGUAGES_INPUT_DIR}/${DEFINITIONS_DIR_NAME}"
ext.DICTIONARIES_INPUT_DIR = "${LANGUAGES_INPUT_DIR}/${DICTIONARIES_DIR_NAME}"

ext.DICTIONARIES_DOWNLOAD_DIR = "${project.rootDir}/${DICTIONARIES_DOWNLOAD_DIR_NAME}"

ext.LANGUAGES_OUTPUT_DIR = "${MAIN_ASSETS_DIR}/${LANGUAGES_DIR_NAME}"
ext.DEFINITIONS_OUTPUT_DIR = "${LANGUAGES_OUTPUT_DIR}/${DEFINITIONS_DIR_NAME}"
ext.DICTIONARY_META_OUTPUT_DIR = "${LANGUAGES_OUTPUT_DIR}/${DICTIONARIES_DIR_NAME}"
Expand All @@ -25,5 +28,6 @@ ext.DICTIONARIES_OUTPUT_DIR = "${FULL_VERSION_ASSETS_DIR}/${LANGUAGES_DIR_NAME}/
ext.LANGUAGE_VALIDATION_DIR = layout.buildDirectory.dir("langValidation")

ext.CSV_DELIMITER = ' ' // TAB
ext.DICTIONARY_OUTPUT_EXTENSION = 'zip'
ext.MAX_WORD_FREQUENCY = 255
ext.MAX_ERRORS = 50
78 changes: 69 additions & 9 deletions app/dictionary-tools.gradle
Original file line number Diff line number Diff line change
@@ -1,10 +1,70 @@
ext.getDictionaryProperties = { dictionariesDir, sizesDir ->
fileTree(dir: dictionariesDir).getFiles().parallelStream().forEach {dictionary ->
def hash = dictionary.exists() ? dictionary.text.digest("SHA-1") : ""
def revision = dictionary.exists() ? exec("git log --pretty=tformat:%H -n 1 ${dictionary}") : ""
def size = dictionary.exists() ? dictionary.length() : 0
def words = dictionary.exists() ? dictionary.text.split("\n").length : 0

new File(sizesDir, "${dictionary.getName()}.props.yml").text = "hash: ${hash}\nrevision: ${revision}\nsize: ${size}\nwords: ${words}"
}
}
fileTree(dir: dictionariesDir).getFiles().parallelStream().forEach { dictionary ->
def hash = dictionary.exists() ? dictionary.text.digest("SHA-1") : ""
def revision = dictionary.exists() ? exec("git log --pretty=tformat:%H -n 1 ${dictionary}") : ""
def size = dictionary.exists() ? dictionary.length() : 0
def words = dictionary.exists() ? dictionary.text.split("\n").length : 0

new File(sizesDir, "${dictionary.getName().replaceFirst("\\.\\w+\$", "")}.props.yml").text = "hash: ${hash}\nrevision: ${revision}\nsize: ${size}\nwords: ${words}"
}
}


/**
 * Static helpers for reading dictionary source lines and for converting words to digit sequences.
 */
class Wrapper {
	/**
	 * Splits a dictionary line into word, transcription and frequency.
	 *
	 * Supported layouts are "word", "word|frequency", "word|transcription" and
	 * "word|transcription|frequency", where "|" stands for the delimiter. The second column is
	 * treated as a transcription only when it consists entirely of the letters a-z and A-Z.
	 *
	 * @param line      the raw dictionary line
	 * @param delimiter the column delimiter; it is used as a regular expression
	 * @return [word, transcription, frequency]. The transcription is "" when there is none.
	 *         The frequency is 0 when missing and -1 when it is not a valid integer or when
	 *         the line has unexpected extra columns.
	 */
	static def getDictionaryLineData(String line, String delimiter) {
		// Up to three columns are needed. With a limit of 2, the frequency of a transcribed
		// word would stay glued to the transcription and both would be lost.
		String[] parts = line.split(delimiter, 3)
		String word = parts[0]
		String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""

		int frequency
		try {
			int frequencyIndex = transcription.isEmpty() ? 1 : 2
			if (parts.length > frequencyIndex + 1) {
				// there is something after the frequency column, so the line is malformed
				frequency = -1
			} else {
				frequency = (parts.length > frequencyIndex ? parts[frequencyIndex] : "0") as int
			}
		} catch (Exception ignored) {
			// not a number
			frequency = -1
		}

		return [word, transcription, frequency]
	}


	/**
	 * Converts a word to the sequence of digits that types it, using the "sounds" map.
	 *
	 * When isTranscribed is false, the word is upper-cased using the locale and looked up
	 * character by character. When isTranscribed is true, the word is used as is and split
	 * into sounds before each upper-case character, for example "NiHao" is "Ni" + "Hao".
	 *
	 * @param locale        the language locale
	 * @param word          the word, or its transcription when isTranscribed is true
	 * @param sounds        map of sound (or letter) -> digit sequence
	 * @param isTranscribed whether "word" is a transcription
	 * @return the digit sequence of the word
	 * @throws IllegalArgumentException when a sound is not in the map or the word is empty
	 */
	static def wordToDigitSequence(Locale locale, String word, HashMap<String, String> sounds, boolean isTranscribed) {
		String sequence = ""

		final String normalizedWord = isTranscribed ? word : word.toUpperCase(locale)
		String currentSound = ""

		for (int i = 0, end = normalizedWord.length() - 1; i <= end; i++) {
			char currentChar = normalizedWord.charAt(i)
			char nextChar = i < end ? normalizedWord.charAt(i + 1) : 0
			int nextCharType = Character.getType(nextChar)

			currentSound += currentChar

			// charAt(i) returns "ΐ" as three separate characters, but they must be treated as one.
			if (
				locale.getLanguage() == "el"
				&& (nextCharType == Character.NON_SPACING_MARK || nextCharType == Character.ENCLOSING_MARK || nextCharType == Character.COMBINING_SPACING_MARK)
			) {
				continue
			}

			// a sound is complete at every character of a regular word, at the end of the
			// word, or right before the upper-case character that starts the next sound
			if (!isTranscribed || i == end || Character.isUpperCase(nextChar)) {
				if (!sounds.containsKey(currentSound)) {
					throw new IllegalArgumentException("Sound or layout entry '${currentSound}' does not belong to the language sound list: ${sounds}.")
				} else {
					sequence += sounds.get(currentSound)
					currentSound = ""
				}
			}
		}

		if (sequence.isEmpty()) {
			throw new IllegalArgumentException("The word does not contain any valid sounds.")
		}

		return sequence
	}
}

// Publish the Wrapper class as the extra property "DictionaryTools", so that its static
// helpers can be called as DictionaryTools.<method>() from the scripts applying this file.
ext.DictionaryTools = Wrapper
4 changes: 2 additions & 2 deletions app/src/main/AndroidManifest.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:tools="http://schemas.android.com/tools"
android:versionCode="774"
android:versionName="40.0"
android:versionCode="775"
android:versionName="40.1"
xmlns:android="http://schemas.android.com/apk/res/android">

<uses-permission android:name="android.permission.POST_NOTIFICATIONS"/> <!-- allows displaying notifications on Android >= 13 -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public static NaturalLanguage getLanguage(@NonNull Context context, String line)
return null;
}

String[] parts = WordFile.splitLine(line);
String[] parts = WordFile.getLineData(line);
if (parts == null || parts.length < 2) {
return null;
}
Expand All @@ -79,7 +79,7 @@ public static NaturalLanguage getLanguage(@NonNull Context context, String line)
}

@NonNull public static String getWord(String line) {
String[] parts = WordFile.splitLine(line);
String[] parts = WordFile.getLineData(line);
return parts != null && parts.length > 0 ? parts[0] : "";
}
}
12 changes: 12 additions & 0 deletions app/src/main/java/io/github/sspanak/tt9/db/entities/Word.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import androidx.annotation.NonNull;

import java.util.ArrayList;

public class Word {
public int frequency;
public int position;
Expand All @@ -15,4 +17,14 @@ public static Word create(@NonNull String word, int frequency, int position) {

return w;
}

/**
 * Creates a Word for every entry of the given line.
 * The first word gets the highest frequency, equal to the number of words, and each
 * following word gets a frequency lower by one. Positions are assigned consecutively,
 * starting at {@code position}.
 */
public static ArrayList<Word> create(WordFileLine line, int position) {
	final int total = line.words.size();
	final ArrayList<Word> result = new ArrayList<>(total);

	int frequency = total;
	for (int offset = 0; offset < total; offset++, frequency--) {
		result.add(create(line.words.get(offset), frequency, position + offset));
	}

	return result;
}
}
Loading

0 comments on commit a6e84c3

Please sign in to comment.