Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

vikivivi · 2021-05-10T12:24:03Z

Below is a Bash shell script that automatically generates the en-GB_to_en-US and en-US_to_en-GB dictionaries from SCOWL VarCon.
I hope it is useful.

Total entries 16216: dictionary_custom_varcon_en-GB_to_en-US.txt
Total entries 15916: dictionary_custom_varcon_en-US_to_en-GB.txt

#!/bin/bash
#
# Author of this script: vikivivi
# This script version: 20210520.00
#
# Creates the en-GB_to_en-US and en-US_to_en-GB dictionaries from the
# varcon.txt file of the wordlist project.
# See SCOWL (and friends):
# SCOWL is derived from many sources under a BSD compatible license.
# The combined work is freely available under a MIT-like license.
# http://wordlist.aspell.net/varcon/
# https://github.com/en-wl/wordlist/
#
# Note: the two directions are not mirror images of each other, so take care
# when converting en-US_to_en-GB.
# For example, en-GB_to_en-US contains "aunty->auntie".
# But en-US_to_en-GB must not contain "auntie->aunty"; "auntie" stays "auntie".

# Input word list. The path is relative, so it is resolved against the
# directory the script is run from.
WORDLIST_VARCON="./varcon.txt"

# Output dictionaries (one "<from>-><to>" entry per line), also written
# relative to the current directory.
DICT_FILE_en_GB_to_en_US="dictionary_custom_varcon_en-GB_to_en-US.txt"
DICT_FILE_en_US_to_en_GB="dictionary_custom_varcon_en-US_to_en-GB.txt"

#######################################
# Make sure the VarCon word list is available locally.
#
# When ${WORDLIST_VARCON} is missing (or is an empty leftover of an earlier
# failed download) it is fetched with wget. The download goes to a temporary
# "<name>.part" file that is moved into place only after wget succeeded, so a
# failed or interrupted transfer never leaves a truncated word list behind
# that a later run would silently accept.
#
# Globals:   WORDLIST_VARCON (read) - path of the local word list
# Arguments: none
# Outputs:   wget progress; an error message on stderr on failure
# Returns:   0 when the file is available; exits the script with 1 otherwise
#######################################
func_download_varcon() {
	local _VARCON_URL="https://github.com/en-wl/wordlist/raw/master/varcon/varcon.txt"
	local _VARCON_TMP="${WORDLIST_VARCON}.part"

	# A non-empty regular file is taken as a usable copy; nothing to do.
	if [ -f "${WORDLIST_VARCON}" ] && [ -s "${WORDLIST_VARCON}" ]; then
		return 0
	fi

	# "wget -O" creates the output file even when the transfer fails, hence
	# the detour through a temporary file that is removed on any error.
	if ! wget -O "${_VARCON_TMP}" "${_VARCON_URL}" || [ ! -s "${_VARCON_TMP}" ]; then
		rm -f "${_VARCON_TMP}"
		echo "ERROR: failed to download ${WORDLIST_VARCON}" >&2
		exit 1
	fi

	if ! mv -f "${_VARCON_TMP}" "${WORDLIST_VARCON}"; then
		rm -f "${_VARCON_TMP}"
		echo "ERROR: failed to download ${WORDLIST_VARCON}" >&2
		exit 1
	fi
}

#######################################
# Build the en-GB -> en-US dictionary from the VarCon word list.
#
# Every non-blank, non-comment line of ${WORDLIST_VARCON} is stripped of its
# '|' / '#' trailer and split on '/' into "<tags>: <word>" segments. Words
# tagged "A" are collected as American spellings, words tagged "B" or "Bv" as
# British spellings. Each British word that differs from the first American
# word is appended to the dictionary as "<en-GB>-><en-US>", unless that entry
# or its reverse is already present (compared case-insensitively). Lines that
# produce no entry are appended unchanged to the "_unknown.log" file.
#
# Globals:
#   WORDLIST_VARCON                   (read)    input word list
#   DICT_FILE_en_GB_to_en_US          (read)    output dictionary, recreated
#   DICT_FILE_en_GB_to_en_US_unknown  (written) name of the reject log
# Arguments: none
# Outputs:   one progress line on stdout
#######################################
func_process_varcon_en_GB_to_en_US() {
	DICT_FILE_en_GB_to_en_US_unknown="dictionary_custom_varcon_en-GB_to_en-US_unknown.log"

	# Start from empty output files on every run
	rm -f "${DICT_FILE_en_GB_to_en_US}"
	rm -f "${DICT_FILE_en_GB_to_en_US_unknown}"
	touch "${DICT_FILE_en_GB_to_en_US}"
	touch "${DICT_FILE_en_GB_to_en_US_unknown}"

	echo "INFO: Processing en_GB_to_en_US with varcon.txt..."
	local STR_WHOLE_LINE STR_WHOLE_LINE_CLEAN _STR_TMP _LANG _WORD
	local WORD_en_GB WORD_en_US_TARGET CONVERSION_en_GB_to_en_US_OK
	local -a STR_ARRAY WORD_ARRAY_en_GB WORD_ARRAY_en_US
	grep -v '^\s*$\|^\s*\#' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
		# A line without any forward slash offers no alternative spelling
		if [[ "${STR_WHOLE_LINE}" != */* ]]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
			continue
		fi

		# Cut everything from the first '|' and from the first '#' onwards
		STR_WHOLE_LINE_CLEAN="${STR_WHOLE_LINE%%"|"*}"
		STR_WHOLE_LINE_CLEAN="${STR_WHOLE_LINE_CLEAN%%"#"*}"
		# Split into "<tags>: <word>" segments
		IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

		WORD_ARRAY_en_GB=()
		WORD_ARRAY_en_US=()
		for _STR_TMP in "${STR_ARRAY[@]}"; do
			# Drop leading blanks of the segment
			_STR_TMP="${_STR_TMP#"${_STR_TMP%%[![:blank:]]*}"}"

			# Tags: text before the first ':', spaces turned into commas and
			# wrapped in commas so that ",A," only matches the exact tag "A"
			_LANG=",${_STR_TMP%%:*},"
			_LANG="${_LANG// /,}"

			# Word: text between the first and second ':', without leading
			# blanks and trailing spaces. Taken literally, so characters such
			# as '%' or '\' in a word are preserved.
			_WORD=""
			if [[ "${_STR_TMP}" == *:* ]]; then
				_WORD="${_STR_TMP#*:}"
				_WORD="${_WORD%%:*}"
				_WORD="${_WORD#"${_WORD%%[![:blank:]]*}"}"
				_WORD="${_WORD%"${_WORD##*[! ]}"}"
			fi

			# en_GB_to_en_US: Convert B and Bv into official A
			if [[ "${_LANG}" == *",A,"* ]]; then
				WORD_ARRAY_en_US+=("${_WORD}")
			fi
			if [[ "${_LANG}" == *",B,"* ]] || [[ "${_LANG}" == *",Bv,"* ]]; then
				WORD_ARRAY_en_GB+=("${_WORD}")
			fi
		done

		# Both sides are needed to form an entry
		if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
			continue
		fi

		# Every British spelling maps to the first American spelling of the line
		WORD_en_US_TARGET="${WORD_ARRAY_en_US[0]}"
		CONVERSION_en_GB_to_en_US_OK=0
		for WORD_en_GB in "${WORD_ARRAY_en_GB[@]}"; do
			if [[ "${WORD_en_GB}" == "${WORD_en_US_TARGET}" ]]; then
				continue
			fi
			# Skip entries already present, in either direction. The match is
			# case-insensitive, whole-line and fixed-string (-F), so a '.' or
			# '*' inside a word is not interpreted as a regular expression.
			if ! grep -qixF \
				-e "${WORD_en_GB}->${WORD_en_US_TARGET}" \
				-e "${WORD_en_US_TARGET}->${WORD_en_GB}" \
				"${DICT_FILE_en_GB_to_en_US}"; then
				printf '%s\n' "${WORD_en_GB}->${WORD_en_US_TARGET}" >> "${DICT_FILE_en_GB_to_en_US}"
				CONVERSION_en_GB_to_en_US_OK=1
			fi
		done

		# Nothing was added for this line: keep it for manual review
		if [ ${CONVERSION_en_GB_to_en_US_OK} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_GB_to_en_US_unknown}"
		fi
	done
}

#######################################
# Build the en-US -> en-GB dictionary from the VarCon word list.
#
# Every non-blank, non-comment line of ${WORDLIST_VARCON} is stripped of its
# '|' / '#' trailer and split on '/' into "<tags>: <word>" segments. Words
# tagged "B" are collected as British spellings, words tagged "A" or "Av" as
# American spellings. Each American word that differs from the first British
# word is appended to the dictionary as "<en-US>-><en-GB>", unless that entry
# or its reverse is already present (compared case-insensitively). Lines that
# produce no entry are appended unchanged to the "_unknown.log" file.
#
# Globals:
#   WORDLIST_VARCON                   (read)    input word list
#   DICT_FILE_en_US_to_en_GB          (read)    output dictionary, recreated
#   DICT_FILE_en_US_to_en_GB_unknown  (written) name of the reject log
# Arguments: none
# Outputs:   one progress line on stdout
#######################################
func_process_varcon_en_US_to_en_GB() {
	DICT_FILE_en_US_to_en_GB_unknown="dictionary_custom_varcon_en-US_to_en-GB_unknown.log"

	# Start from empty output files on every run
	rm -f "${DICT_FILE_en_US_to_en_GB}"
	rm -f "${DICT_FILE_en_US_to_en_GB_unknown}"
	touch "${DICT_FILE_en_US_to_en_GB}"
	touch "${DICT_FILE_en_US_to_en_GB_unknown}"

	echo "INFO: Processing en_US_to_en_GB with varcon.txt..."
	local STR_WHOLE_LINE STR_WHOLE_LINE_CLEAN _STR_TMP _LANG _WORD
	local WORD_en_US WORD_en_GB_TARGET CONVERSION_en_US_to_en_GB_OK
	local -a STR_ARRAY WORD_ARRAY_en_GB WORD_ARRAY_en_US
	grep -v '^\s*$\|^\s*\#' < "${WORDLIST_VARCON}" | while IFS= read -r STR_WHOLE_LINE; do
		# A line without any forward slash offers no alternative spelling
		if [[ "${STR_WHOLE_LINE}" != */* ]]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
			continue
		fi

		# Cut everything from the first '|' and from the first '#' onwards
		STR_WHOLE_LINE_CLEAN="${STR_WHOLE_LINE%%"|"*}"
		STR_WHOLE_LINE_CLEAN="${STR_WHOLE_LINE_CLEAN%%"#"*}"
		# Split into "<tags>: <word>" segments
		IFS='/' read -ra STR_ARRAY <<< "${STR_WHOLE_LINE_CLEAN}"

		WORD_ARRAY_en_GB=()
		WORD_ARRAY_en_US=()
		for _STR_TMP in "${STR_ARRAY[@]}"; do
			# Drop leading blanks of the segment
			_STR_TMP="${_STR_TMP#"${_STR_TMP%%[![:blank:]]*}"}"

			# Tags: text before the first ':', spaces turned into commas and
			# wrapped in commas so that ",B," only matches the exact tag "B"
			_LANG=",${_STR_TMP%%:*},"
			_LANG="${_LANG// /,}"

			# Word: text between the first and second ':', without leading
			# blanks and trailing spaces. Taken literally, so characters such
			# as '%' or '\' in a word are preserved.
			_WORD=""
			if [[ "${_STR_TMP}" == *:* ]]; then
				_WORD="${_STR_TMP#*:}"
				_WORD="${_WORD%%:*}"
				_WORD="${_WORD#"${_WORD%%[![:blank:]]*}"}"
				_WORD="${_WORD%"${_WORD##*[! ]}"}"
			fi

			# en_US_to_en_GB: Convert A and Av into official B
			if [[ "${_LANG}" == *",B,"* ]]; then
				WORD_ARRAY_en_GB+=("${_WORD}")
			fi
			if [[ "${_LANG}" == *",A,"* ]] || [[ "${_LANG}" == *",Av,"* ]]; then
				WORD_ARRAY_en_US+=("${_WORD}")
			fi
		done

		# Both sides are needed to form an entry
		if [ ${#WORD_ARRAY_en_GB[@]} -eq 0 ] || [ ${#WORD_ARRAY_en_US[@]} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
			continue
		fi

		# Every American spelling maps to the first British spelling of the line
		WORD_en_GB_TARGET="${WORD_ARRAY_en_GB[0]}"
		CONVERSION_en_US_to_en_GB_OK=0
		for WORD_en_US in "${WORD_ARRAY_en_US[@]}"; do
			if [[ "${WORD_en_US}" == "${WORD_en_GB_TARGET}" ]]; then
				continue
			fi
			# Skip entries already present, in either direction. The match is
			# case-insensitive, whole-line and fixed-string (-F), so a '.' or
			# '*' inside a word is not interpreted as a regular expression.
			if ! grep -qixF \
				-e "${WORD_en_US}->${WORD_en_GB_TARGET}" \
				-e "${WORD_en_GB_TARGET}->${WORD_en_US}" \
				"${DICT_FILE_en_US_to_en_GB}"; then
				printf '%s\n' "${WORD_en_US}->${WORD_en_GB_TARGET}" >> "${DICT_FILE_en_US_to_en_GB}"
				CONVERSION_en_US_to_en_GB_OK=1
			fi
		done

		# Nothing was added for this line: keep it for manual review
		if [ ${CONVERSION_en_US_to_en_GB_OK} -eq 0 ]; then
			printf '%s\n' "${STR_WHOLE_LINE}" >> "${DICT_FILE_en_US_to_en_GB_unknown}"
		fi
	done
}

# Sort a dictionary file in place.
# Arguments: $1 - path of the dictionary file
# Globals:   DICT_FILE (written) - set to $1
func_sort_dictionary() {
	DICT_FILE="${1}"

	# C-locale sort with case folded (-f), leading blanks ignored (-b) and
	# duplicate lines dropped (-u); the result overwrites the input (-o).
	# The same command can also be run by hand from the command line.
	LC_ALL=C sort -fbu -o "${DICT_FILE}" "${DICT_FILE}"
}

# Main program: fetch the word list, build both dictionaries, then sort each
func_download_varcon
func_process_varcon_en_GB_to_en_US
func_process_varcon_en_US_to_en_GB
for _DICT_TO_SORT in "${DICT_FILE_en_GB_to_en_US}" "${DICT_FILE_en_US_to_en_GB}"; do
	func_sort_dictionary "${_DICT_TO_SORT}"
done

This was referenced May 10, 2021

More potential source dictionaries #1614

Open

en-GB to en-US dict: add licenced->licensed #1869

Closed

vikivivi changed the title ~~Script to auto generate en-GB to en-US dictionary using SCOWL VarCon~~ Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon May 16, 2021

vikivivi changed the title ~~Script to auto generate en-GB_to_en-US & en-US_to_en_GB dictionaries using SCOWL VarCon~~ Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon May 16, 2021

vikivivi mentioned this issue May 16, 2021

Consider ways to automate en_US->en_GB dictionary corrections #1468

Open

vikivivi mentioned this issue Sep 2, 2023

Use SCOWL's varcon.txt for en_GB → en_US conversion #3061

Closed

DimitriPapadopoulos added enhancement dictionary Changes to the dictionary labels Sep 2, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

vikivivi commented May 10, 2021 •

edited

Loading

Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

Script to auto generate en-GB_to_en-US & en-US_to_en-GB dictionaries using SCOWL VarCon #1917

Comments

vikivivi commented May 10, 2021 • edited Loading

vikivivi commented May 10, 2021 •

edited

Loading