Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

Open
vikivivi opened this issue May 10, 2021 · 0 comments
Labels
dictionary Changes to the dictionary enhancement

Comments

@vikivivi
Copy link
Contributor

vikivivi commented May 10, 2021

Please find below a shell script (bash) that auto-generates the en-GB_to_en-US and en-US_to_en-GB dictionaries using SCOWL VarCon.
I hope it is useful.

Total entries 16216: dictionary_custom_varcon_en-GB_to_en-US.txt
Total entries 15916: dictionary_custom_varcon_en-US_to_en-GB.txt

#!/bin/bash
#
# Author of this script: vikivivi
# This script version: 20210520.00
# This script will create en-GB_to_en-US and en-US_to_en-GB dictionaries using varcon.txt from wordlist project.
# See SCOWL (and friends):
# SCOWL is derived from many sources under a BSD compatible license.
# The combined work is freely available under a MIT-like license.
# http://wordlist.aspell.net/varcon/
# https://github.com/en-wl/wordlist/
#
# Note: Beware when converting en-US_to_en-GB; it is not simply the reverse of en-GB_to_en-US.
# For example, en-GB_to_en-US contains "aunty->auntie".
# But en-US_to_en-GB must not contain "auntie->aunty"; there, "auntie" should remain "auntie".

# Local path of the VarCon word list; it is downloaded to this path when missing.
WORDLIST_VARCON="./varcon.txt"

# Output dictionary files, one per conversion direction.
DICT_FILE_en_GB_to_en_US="dictionary_custom_varcon_en-GB_to_en-US.txt"
DICT_FILE_en_US_to_en_GB="dictionary_custom_varcon_en-US_to_en-GB.txt"

# Download varcon.txt from the wordlist project unless it already exists.
#
# The download goes to a temporary "<target>.part" file that is renamed only
# after wget succeeds. wget -O creates its output file even when the transfer
# fails, so writing directly to the target would leave an empty or partial
# file behind and make the next run skip the download.
#
# Globals:
#   WORDLIST_VARCON (read) - destination path of varcon.txt
# Outputs:
#   An ERROR line on stderr when the download fails.
# Exits:
#   1 when the download (or the final rename) fails.
func_download_varcon() {
	local tmp_file

	if [ -f "${WORDLIST_VARCON}" ]; then
		return 0
	fi

	tmp_file="${WORDLIST_VARCON}.part"
	if ! wget -O "${tmp_file}" "https://github.com/en-wl/wordlist/raw/master/varcon/varcon.txt"; then
		rm -f -- "${tmp_file}"
		echo "ERROR: failed to download ${WORDLIST_VARCON}" >&2
		exit 1
	fi

	if ! mv -f -- "${tmp_file}" "${WORDLIST_VARCON}"; then
		rm -f -- "${tmp_file}"
		echo "ERROR: failed to download ${WORDLIST_VARCON}" >&2
		exit 1
	fi
}

# Build the en-GB -> en-US dictionary from varcon.txt.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every other line has its '|' note and '#' comment removed, is split on '/'
# into "<tags>: <word>" segments, and each word tagged B or Bv is mapped to
# the first word tagged A on that line. Lines that produce no entry are
# appended to the "unknown" log, which is created in the current directory.
#
# Globals:
#   WORDLIST_VARCON                   (read)    varcon.txt input path
#   DICT_FILE_en_GB_to_en_US          (read)    dictionary path; file is recreated
#   DICT_FILE_en_GB_to_en_US_unknown  (written) path of the unknown-lines log
# Outputs:
#   An INFO progress line on stdout; "gb->us" lines in the dictionary file.
func_process_varcon_en_GB_to_en_US() {
	DICT_FILE_en_GB_to_en_US_unknown="dictionary_custom_varcon_en-GB_to_en-US_unknown.log"

	rm -f "${DICT_FILE_en_GB_to_en_US}"
	rm -f "${DICT_FILE_en_GB_to_en_US_unknown}"
	touch "${DICT_FILE_en_GB_to_en_US}"
	touch "${DICT_FILE_en_GB_to_en_US_unknown}"

	echo "INFO: Processing en_GB_to_en_US with varcon.txt..."
	declare -a STR_ARRAY
	declare -a WORD_ARRAY_en_GB
	declare -a WORD_ARRAY_en_US
	# ERE with POSIX classes instead of GNU-only '\s' and '\|'.
	grep -v -E '^[[:space:]]*(#|$)' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
		# A line without any forward slash has no alternatives to pair up.
		if [[ "${STR_WHOLE_LINE}" != */* ]]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
			continue
		fi

		# Clean up comment '|' and '#', then trim. [[:blank:]] is used because
		# '\t' inside a bracket expression is not a tab in every sed.
		STR_WHOLE_LINE_CLEAN=$(printf '%s\n' "${STR_WHOLE_LINE}" \
			| sed -e 's/|.*//' -e 's/#.*//' -e 's/^[[:blank:]]*//' -e 's/ *$//')
		# Convert into array
		IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

		WORD_ARRAY_en_GB=()
		WORD_ARRAY_en_US=()
		for _STR_TMP in "${STR_ARRAY[@]}"; do
			# Tags become a comma-delimited list such as ",A,Bv," so that a
			# tag can be matched exactly between two commas.
			_LANG=$(printf '%s\n' "${_STR_TMP}" | sed -e 's/^[[:blank:]]*//' | awk -F':' '{printf ",%s,", $1}' | tr " " ",")
			# The word is passed as a printf argument, never as the format,
			# so a '%' in the data is printed literally.
			_WORD=$(printf '%s\n' "${_STR_TMP}" | awk -F':' '{printf "%s", $2}' | sed -e 's/^[[:blank:]]*//' -e 's/ *$//')

			# A segment without a word cannot form a valid "from->to" entry.
			if [ -z "${_WORD}" ]; then
				continue
			fi

			# en_GB_to_en_US: Convert B and Bv into official A
			case "${_LANG}" in
				*,A,*) WORD_ARRAY_en_US+=("${_WORD}") ;;
			esac
			case "${_LANG}" in
				*,B,* | *,Bv,*) WORD_ARRAY_en_GB+=("${_WORD}") ;;
			esac
		done

		# If either en_GB or en_US is not found, continue
		if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
			continue
		fi

		CONVERSION_en_GB_to_en_US_OK=0
		for WORD_en_GB in "${WORD_ARRAY_en_GB[@]}"; do
			# Matching en_GB_to_en_US
			if [ "x${WORD_en_GB}" != "x${WORD_ARRAY_en_US[0]}" ]; then
				# Skip the entry when it, or its reverse, is already present
				# (ignoring case). -F -x compares whole lines literally, so
				# characters such as '.' in a word are not treated as regex.
				if ! grep -q -i -x -F \
					-e "${WORD_en_GB}->${WORD_ARRAY_en_US[0]}" \
					-e "${WORD_ARRAY_en_US[0]}->${WORD_en_GB}" \
					"${DICT_FILE_en_GB_to_en_US}"; then
					printf '%s\n' "${WORD_en_GB}->${WORD_ARRAY_en_US[0]}" >> "${DICT_FILE_en_GB_to_en_US}"
					CONVERSION_en_GB_to_en_US_OK=1
				fi
			fi
		done

		# If conversion is unsuccessful
		if [ ${CONVERSION_en_GB_to_en_US_OK} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
		fi
	done
}

# Build the en-US -> en-GB dictionary from varcon.txt.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every other line has its '|' note and '#' comment removed, is split on '/'
# into "<tags>: <word>" segments, and each word tagged A or Av is mapped to
# the first word tagged B on that line. Lines that produce no entry are
# appended to the "unknown" log, which is created in the current directory.
#
# Globals:
#   WORDLIST_VARCON                   (read)    varcon.txt input path
#   DICT_FILE_en_US_to_en_GB          (read)    dictionary path; file is recreated
#   DICT_FILE_en_US_to_en_GB_unknown  (written) path of the unknown-lines log
# Outputs:
#   An INFO progress line on stdout; "us->gb" lines in the dictionary file.
func_process_varcon_en_US_to_en_GB() {
	DICT_FILE_en_US_to_en_GB_unknown="dictionary_custom_varcon_en-US_to_en-GB_unknown.log"

	rm -f "${DICT_FILE_en_US_to_en_GB}"
	rm -f "${DICT_FILE_en_US_to_en_GB_unknown}"
	touch "${DICT_FILE_en_US_to_en_GB}"
	touch "${DICT_FILE_en_US_to_en_GB_unknown}"

	echo "INFO: Processing en_US_to_en_GB with varcon.txt..."
	declare -a STR_ARRAY
	declare -a WORD_ARRAY_en_GB
	declare -a WORD_ARRAY_en_US
	# ERE with POSIX classes instead of GNU-only '\s' and '\|'.
	grep -v -E '^[[:space:]]*(#|$)' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
		# A line without any forward slash has no alternatives to pair up.
		if [[ "${STR_WHOLE_LINE}" != */* ]]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
			continue
		fi

		# Clean up comment '|' and '#', then trim. [[:blank:]] is used because
		# '\t' inside a bracket expression is not a tab in every sed.
		STR_WHOLE_LINE_CLEAN=$(printf '%s\n' "${STR_WHOLE_LINE}" \
			| sed -e 's/|.*//' -e 's/#.*//' -e 's/^[[:blank:]]*//' -e 's/ *$//')
		# Convert into array
		IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

		WORD_ARRAY_en_GB=()
		WORD_ARRAY_en_US=()
		for _STR_TMP in "${STR_ARRAY[@]}"; do
			# Tags become a comma-delimited list such as ",B,Av," so that a
			# tag can be matched exactly between two commas.
			_LANG=$(printf '%s\n' "${_STR_TMP}" | sed -e 's/^[[:blank:]]*//' | awk -F':' '{printf ",%s,", $1}' | tr " " ",")
			# The word is passed as a printf argument, never as the format,
			# so a '%' in the data is printed literally.
			_WORD=$(printf '%s\n' "${_STR_TMP}" | awk -F':' '{printf "%s", $2}' | sed -e 's/^[[:blank:]]*//' -e 's/ *$//')

			# A segment without a word cannot form a valid "from->to" entry.
			if [ -z "${_WORD}" ]; then
				continue
			fi

			# en_US_to_en_GB: Convert A and Av into official B
			case "${_LANG}" in
				*,B,*) WORD_ARRAY_en_GB+=("${_WORD}") ;;
			esac
			case "${_LANG}" in
				*,A,* | *,Av,*) WORD_ARRAY_en_US+=("${_WORD}") ;;
			esac
		done

		# If either en_GB or en_US is not found, continue
		if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
			continue
		fi

		CONVERSION_en_US_to_en_GB_OK=0
		for WORD_en_US in "${WORD_ARRAY_en_US[@]}"; do
			# Matching en_US_to_en_GB
			if [ "x${WORD_en_US}" != "x${WORD_ARRAY_en_GB[0]}" ]; then
				# Skip the entry when it, or its reverse, is already present
				# (ignoring case). -F -x compares whole lines literally, so
				# characters such as '.' in a word are not treated as regex.
				if ! grep -q -i -x -F \
					-e "${WORD_en_US}->${WORD_ARRAY_en_GB[0]}" \
					-e "${WORD_ARRAY_en_GB[0]}->${WORD_en_US}" \
					"${DICT_FILE_en_US_to_en_GB}"; then
					printf '%s\n' "${WORD_en_US}->${WORD_ARRAY_en_GB[0]}" >> "${DICT_FILE_en_US_to_en_GB}"
					CONVERSION_en_US_to_en_GB_OK=1
				fi
			fi
		done

		# If conversion is unsuccessful
		if [ ${CONVERSION_en_US_to_en_GB_OK} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
		fi
	done
}

# Sort a dictionary file in place and drop duplicate lines.
#
# The sort runs with LC_ALL=C, folds lower case to upper case for comparison
# (-f), ignores leading blanks (-b) and keeps one line per equal key (-u).
#
# Arguments:
#   $1 - path of the dictionary file to sort
# Outputs:
#   An ERROR line on stderr when the file does not exist.
# Returns:
#   1 when the file does not exist; otherwise the exit status of sort.
func_sort_dictionary() {
	local dict_file="${1:-}"

	if [ ! -f "${dict_file}" ]; then
		echo "ERROR: dictionary file not found: ${dict_file}" >&2
		return 1
	fi

	# -o with the same input and output path is safe: sort reads all input
	# before it writes the output file.
	LC_ALL=C sort -f -b -u -o "${dict_file}" -- "${dict_file}"
}

# Main program: fetch the word list, build both dictionaries, then sort each one.
func_download_varcon
func_process_varcon_en_GB_to_en_US
func_process_varcon_en_US_to_en_GB
for _DICT_TO_SORT in "${DICT_FILE_en_GB_to_en_US}" "${DICT_FILE_en_US_to_en_GB}"; do
	func_sort_dictionary "${_DICT_TO_SORT}"
done
@vikivivi vikivivi changed the title Script to auto generate en-GB to en-US dictionary using SCOWL VarCon Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon May 16, 2021
@vikivivi vikivivi changed the title Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon May 16, 2021
@DimitriPapadopoulos DimitriPapadopoulos added enhancement dictionary Changes to the dictionary labels Sep 2, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
dictionary Changes to the dictionary enhancement
Projects
None yet
Development

No branches or pull requests

2 participants