You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Please find below a bash shell script that auto-generates the en-GB_to_en-US and en-US_to_en-GB dictionaries using SCOWL VarCon.
I hope it is useful.
Total entries 16216: dictionary_custom_varcon_en-GB_to_en-US.txt
Total entries 15916: dictionary_custom_varcon_en-US_to_en-GB.txt
#!/bin/bash
#
# Author of this script: vikivivi
# This script version: 20210520.00
#
# This script will create en-GB_to_en-US and en-US_to_en-GB dictionaries
# using varcon.txt from the wordlist project.
#
# See SCOWL (and friends):
#   SCOWL is derived from many sources under a BSD compatible license.
#   The combined work is freely available under a MIT-like license.
#   http://wordlist.aspell.net/varcon/
#   https://github.com/en-wl/wordlist/
#
# Note: Beware when converting en-US_to_en-GB.
# For example, in en-GB_to_en-US: "aunty->auntie".
# But when en-US_to_en-GB: it is not "auntie->aunty", it should remain as
# "auntie->auntie".

# Input word list (downloaded on first run) and the two generated dictionaries.
WORDLIST_VARCON="./varcon.txt"
DICT_FILE_en_GB_to_en_US="dictionary_custom_varcon_en-GB_to_en-US.txt"
DICT_FILE_en_US_to_en_GB="dictionary_custom_varcon_en-US_to_en-GB.txt"

# Download varcon.txt into ${WORDLIST_VARCON} unless it already exists.
# Exits the whole script with status 1 if the download fails.
func_download_varcon() {
  if [ ! -f "${WORDLIST_VARCON}" ]; then
    # NOTE(review): the scraped source read "github.com" (a mirror-site
    # artifact); restored to the official upstream host.
    wget -O "${WORDLIST_VARCON}" "https://github.com/en-wl/wordlist/raw/master/varcon/varcon.txt"
    RETVAL=$?
    if [ "x${RETVAL}" != "x0" ]; then
      echo "ERROR: failed to download ${WORDLIST_VARCON}"
      exit 1
    fi
  fi
}
# Build the en-GB -> en-US dictionary from varcon.txt.
# Globals (read):  WORDLIST_VARCON, DICT_FILE_en_GB_to_en_US
# Outputs: one "british->american" line per convertible varcon entry;
#          varcon lines that yield no entry go to *_unknown.log.
func_process_varcon_en_GB_to_en_US() {
  DICT_FILE_en_GB_to_en_US_unknown="dictionary_custom_varcon_en-GB_to_en-US_unknown.log"

  # Start from empty output files on every run.
  rm -f "${DICT_FILE_en_GB_to_en_US}"
  rm -f "${DICT_FILE_en_GB_to_en_US_unknown}"
  touch "${DICT_FILE_en_GB_to_en_US}"
  touch "${DICT_FILE_en_GB_to_en_US_unknown}"

  echo "INFO: Processing en_GB_to_en_US with varcon.txt..."

  declare -a STR_ARRAY
  declare -a WORD_ARRAY_en_GB
  declare -a WORD_ARRAY_en_US

  # Skip blank lines and '#' comment lines.  The while loop runs in a
  # pipeline subshell; that is fine here because all results are appended
  # to files, not kept in shell variables.
  grep -v '^\s*$\|^\s*\#' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
    # A convertible entry contains at least one '/' separating the variants.
    SLASH_FORWARD_FOUND=$(echo "${STR_WHOLE_LINE}" | grep -o '/' | wc -l)
    if [ "${SLASH_FORWARD_FOUND}" -lt 1 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
      continue
    fi

    # Strip trailing '|' and '#' comments plus surrounding whitespace.
    STR_WHOLE_LINE_CLEAN=$(echo "${STR_WHOLE_LINE}" | sed 's/|.*//g' | sed 's/#.*//g' | sed -e 's/^[ \t]*//' | sed -e 's/ *$//g')

    # Split "TAGS: word / TAGS: word / ..." on '/'.
    IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

    WORD_ARRAY_en_GB=()
    WORD_ARRAY_en_US=()

    for _STR_TMP in "${STR_ARRAY[@]}"; do
      # Normalise the tag list to ",TAG,TAG," so ",A," / ",B," can be
      # matched exactly without also matching "Av" / "Bv".
      _LANG=$(echo "${_STR_TMP}" | sed -e 's/^[ \t]*//' | awk -F':' '{printf ",%s,", $1}' | tr " " ",")
      # '%s' keeps the word from being interpreted as a printf format string.
      _WORD=$(echo "${_STR_TMP}" | awk -F':' '{printf "%s", $2}' | sed -e 's/^[ \t]*//' | sed -e 's/ *$//g')

      # en_GB_to_en_US: official American (A) spellings are the targets...
      _LANG_CHECK_en_US=$(echo "${_LANG}" | grep ",A,")
      if [ "x${_LANG_CHECK_en_US}" != "x" ]; then
        WORD_ARRAY_en_US+=("${_WORD}")
      fi

      # ...and British (B) plus British variant (Bv) spellings are sources.
      _LANG_CHECK_en_GB=$(echo "${_LANG}" | grep ",B,")
      _LANG_CHECK_en_GB_Bv=$(echo "${_LANG}" | grep ",Bv,")
      if [ "x${_LANG_CHECK_en_GB}" != "x" ] || [ "x${_LANG_CHECK_en_GB_Bv}" != "x" ]; then
        WORD_ARRAY_en_GB+=("${_WORD}")
      fi
    done

    # If either side is missing the entry cannot be converted.
    if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
      continue
    fi

    CONVERSION_en_GB_to_en_US_OK=0
    for WORD_en_GB in "${WORD_ARRAY_en_GB[@]}"; do
      # Map every British spelling to the first official American spelling,
      # unless both are already identical.
      if [ "x${WORD_en_GB}" != "x${WORD_ARRAY_en_US[0]}" ]; then
        # Fixed-string whole-line match (-x -F) so words containing regex
        # metacharacters cannot corrupt the check; -i ignores case.
        CHECK_DUPLICATE_ENTRY=$(grep -i -x -F -- "${WORD_en_GB}->${WORD_ARRAY_en_US[0]}" "${DICT_FILE_en_GB_to_en_US}")
        CHECK_CONFLICT_ENTRY=$(grep -i -x -F -- "${WORD_ARRAY_en_US[0]}->${WORD_en_GB}" "${DICT_FILE_en_GB_to_en_US}")
        if [ "${CHECK_DUPLICATE_ENTRY}" = "" ] && [ "${CHECK_CONFLICT_ENTRY}" = "" ]; then
          echo "${WORD_en_GB}->${WORD_ARRAY_en_US[0]}" >> "${DICT_FILE_en_GB_to_en_US}"
          CONVERSION_en_GB_to_en_US_OK=1
        fi
      fi
    done

    # Log lines that produced no dictionary entry at all.
    if [ ${CONVERSION_en_GB_to_en_US_OK} -eq 0 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
    fi
  done
}
# Build the en-US -> en-GB dictionary from varcon.txt.
# Globals (read):  WORDLIST_VARCON, DICT_FILE_en_US_to_en_GB
# Outputs: one "american->british" line per convertible varcon entry;
#          varcon lines that yield no entry go to *_unknown.log.
func_process_varcon_en_US_to_en_GB() {
  DICT_FILE_en_US_to_en_GB_unknown="dictionary_custom_varcon_en-US_to_en-GB_unknown.log"

  # Start from empty output files on every run.
  rm -f "${DICT_FILE_en_US_to_en_GB}"
  rm -f "${DICT_FILE_en_US_to_en_GB_unknown}"
  touch "${DICT_FILE_en_US_to_en_GB}"
  touch "${DICT_FILE_en_US_to_en_GB_unknown}"

  echo "INFO: Processing en_US_to_en_GB with varcon.txt..."

  declare -a STR_ARRAY
  declare -a WORD_ARRAY_en_GB
  declare -a WORD_ARRAY_en_US

  # Skip blank lines and '#' comment lines.  The while loop runs in a
  # pipeline subshell; that is fine here because all results are appended
  # to files, not kept in shell variables.
  grep -v '^\s*$\|^\s*\#' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
    # A convertible entry contains at least one '/' separating the variants.
    SLASH_FORWARD_FOUND=$(echo "${STR_WHOLE_LINE}" | grep -o '/' | wc -l)
    if [ "${SLASH_FORWARD_FOUND}" -lt 1 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
      continue
    fi

    # Strip trailing '|' and '#' comments plus surrounding whitespace.
    STR_WHOLE_LINE_CLEAN=$(echo "${STR_WHOLE_LINE}" | sed 's/|.*//g' | sed 's/#.*//g' | sed -e 's/^[ \t]*//' | sed -e 's/ *$//g')

    # Split "TAGS: word / TAGS: word / ..." on '/'.
    IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

    WORD_ARRAY_en_GB=()
    WORD_ARRAY_en_US=()

    for _STR_TMP in "${STR_ARRAY[@]}"; do
      # Normalise the tag list to ",TAG,TAG," so ",A," / ",B," can be
      # matched exactly without also matching "Av" / "Bv".
      _LANG=$(echo "${_STR_TMP}" | sed -e 's/^[ \t]*//' | awk -F':' '{printf ",%s,", $1}' | tr " " ",")
      # '%s' keeps the word from being interpreted as a printf format string.
      _WORD=$(echo "${_STR_TMP}" | awk -F':' '{printf "%s", $2}' | sed -e 's/^[ \t]*//' | sed -e 's/ *$//g')

      # en_US_to_en_GB: official British (B) spellings are the targets...
      _LANG_CHECK_en_GB=$(echo "${_LANG}" | grep ",B,")
      if [ "x${_LANG_CHECK_en_GB}" != "x" ]; then
        WORD_ARRAY_en_GB+=("${_WORD}")
      fi

      # ...and American (A) plus American variant (Av) spellings are sources.
      _LANG_CHECK_en_US=$(echo "${_LANG}" | grep ",A,")
      _LANG_CHECK_en_US_Av=$(echo "${_LANG}" | grep ",Av,")
      if [ "x${_LANG_CHECK_en_US}" != "x" ] || [ "x${_LANG_CHECK_en_US_Av}" != "x" ]; then
        WORD_ARRAY_en_US+=("${_WORD}")
      fi
    done

    # If either side is missing the entry cannot be converted.
    if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
      continue
    fi

    CONVERSION_en_US_to_en_GB_OK=0
    for WORD_en_US in "${WORD_ARRAY_en_US[@]}"; do
      # Map every American spelling to the first official British spelling,
      # unless both are already identical (the "auntie->auntie" case in the
      # file header).
      if [ "x${WORD_en_US}" != "x${WORD_ARRAY_en_GB[0]}" ]; then
        # Fixed-string whole-line match (-x -F) so words containing regex
        # metacharacters cannot corrupt the check; -i ignores case.
        CHECK_DUPLICATE_ENTRY=$(grep -i -x -F -- "${WORD_en_US}->${WORD_ARRAY_en_GB[0]}" "${DICT_FILE_en_US_to_en_GB}")
        CHECK_CONFLICT_ENTRY=$(grep -i -x -F -- "${WORD_ARRAY_en_GB[0]}->${WORD_en_US}" "${DICT_FILE_en_US_to_en_GB}")
        if [ "${CHECK_DUPLICATE_ENTRY}" = "" ] && [ "${CHECK_CONFLICT_ENTRY}" = "" ]; then
          echo "${WORD_en_US}->${WORD_ARRAY_en_GB[0]}" >> "${DICT_FILE_en_US_to_en_GB}"
          CONVERSION_en_US_to_en_GB_OK=1
        fi
      fi
    done

    # Log lines that produced no dictionary entry at all.
    if [ ${CONVERSION_en_US_to_en_GB_OK} -eq 0 ]; then
      echo "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
    fi
  done
}
# Sort a generated dictionary file in place.
# Arguments: $1 - path to the dictionary file.
# -f folds case, -b ignores leading blanks, -u drops duplicate lines;
# LC_ALL=C gives a reproducible, locale-independent byte order.
func_sort_dictionary() {
  DICT_FILE="$1"
  LC_ALL=C sort -f -b -u -o "${DICT_FILE}" "${DICT_FILE}"
}
# Main program
# Fetch varcon.txt (no-op if already present), build both dictionaries,
# then sort and de-duplicate each output file in place.
func_download_varcon
func_process_varcon_en_GB_to_en_US
func_process_varcon_en_US_to_en_GB
func_sort_dictionary "${DICT_FILE_en_GB_to_en_US}"
func_sort_dictionary "${DICT_FILE_en_US_to_en_GB}"
The text was updated successfully, but these errors were encountered:
vikivivi
changed the title
Script to auto generate en-GB to en-US dictionary using SCOWL VarCon
Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon
May 16, 2021
vikivivi
changed the title
Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon
Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon
May 16, 2021
Please find the shell script (bash) to auto generate en-GB_to_en-US and en-US_to_en-GB dictionaries using SCOWL VarCon.
Hope it is useful.
Total entries 16216: dictionary_custom_varcon_en-GB_to_en-US.txt
Total entries 15916: dictionary_custom_varcon_en-US_to_en-GB.txt
The text was updated successfully, but these errors were encountered: