prepare-release

#!/bin/bash
#Print the help text to stdout, re-flowed by fmt, then exit.
#The heredoc delimiter is quoted so the text is never subject to shell expansion.
#exit is called without a status, so the script exits with fmt's status.
usage(){
	fmt <<'EOF'
DESCRIPTION
	Calculate work word count, insert release date if none set, update modified date, and check for various common errors.

USAGE
	prepare-release [-h,--help] [-v,--verbose] [-w,--no-word-count] [-d,--no-dates] [-r,--no-revision] DIRECTORY [DIRECTORY...]
		DIRECTORY is the source directory, which must contain DIRECTORY/src/.
		With the --help option, print this help text and exit.
		With the --verbose option, print a progress message for each step.
		With the --no-word-count option, don't perform a word count calculation.
		With the --no-dates option, don't change the release and modified dates.
		With the --no-revision option, don't increment the revision number.
EOF
	exit
}
#Print "Error: MESSAGE" to stderr and exit with status 1.
#	$1: the message. It is passed as printf data, never as the format string, so a % in it (for example in a path) is printed literally.
die(){
	printf "Error: %s\n" "${1}" 1>&2
	exit 1
}
#Abort through die unless a command can be found by the shell.
#	$1: name of the command to look up. Quoted, so it is always looked up as a single name.
#	$2: optional hint appended to the error message after a space.
require(){
	local suggestion=""

	if [ -n "${2:-}" ]; then
		suggestion=" $2"
	fi

	command -v "$1" > /dev/null 2>&1 || die "$1 is not installed.${suggestion}"
}
#Show the help text when the one and only argument is --help or -h
if [ $# -eq 1 ]; then
	case "$1" in
		--help|-h)
			usage
		;;
	esac
fi
#End boilerplate

#Option defaults; the command line parser below overrides them
verbose="false"
doWordCount="true"
doDates="true"
doRevision="true"
dirs=""

#One instant, captured once, rendered in each format that gets written out
timestamp=$(date -u +%s)
isoTimestamp=$(date -u -d "@${timestamp}" +"%Y-%m-%dT%H:%M:%SZ")
#Squeeze whitespace runs to a single space and turn AM/PM into a.m./p.m.
friendlyTimestamp=$(date -u -d "@${timestamp}" +"%B %e, %Y, %l:%M <abbr class=\"time eoc\">%p</abbr>" | sed --regexp-extended -e "s/\s+/ /g" -e "s/>AM/>a.m./" -e "s/>PM/>p.m./")

#Helper tools are looked up next to this script
scriptDir=$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)
wordcountPath="${scriptDir}/word-count"
readingEasePath="${scriptDir}/reading-ease"
unusedSelectorPath="${scriptDir}/find-unused-selectors"
ordinalPath="${scriptDir}/ordinal"

#NOTE(review): ebookRoot is not assigned anywhere in the visible script and these two paths are not read below -- confirm before relying on them
contentOpfFilePath="${ebookRoot}src/epub/content.opf"
colophonPath="${ebookRoot}src/epub/text/colophon.xhtml"

#Parse the command line.
#Options may appear anywhere among the arguments. Every other argument is appended to ${dirs},
#one per line; the list therefore starts with an empty line, which the consumer skips.
while [ $# -gt 0 ]
do
	case "$1" in
		-h|--help)
			#Honour a help request wherever it appears, not only as the sole argument
			usage
		;;
		-v|--verbose)
			verbose="true"
		;;
		-w|--no-word-count)
			doWordCount="false"
		;;
		-d|--no-dates)
			doDates="false"
		;;
		-r|--no-revision)
			doRevision="false"
		;;
		-*)
			#Fail early with a clear message rather than treating a mistyped option as a directory
			die "Unknown option: $1"
		;;
		*)
			dirs=$(printf "%s\n%s" "${dirs}" "$1")
		;;
	esac
	shift
done

#Main loop: prepare each directory listed in ${dirs}, one directory per line.
#The loop is the last stage of a pipeline, so bash runs its body in a subshell; a die in
#the body ends that subshell, and with it the loop.
printf "%s\n" "${dirs}" | while IFS= read -r i;
do
	#Skip empty lines; the list is built with a leading newline
	if [ "${i}" = "" ]; then
		continue
	fi

	#Absolute path of the directory, with any trailing slash removed first
	srcDir="$(realpath "${i%/}")"

	#Only the presence of a src/ subdirectory is checked
	if [ ! -d "${srcDir}/src" ]; then
		die "${srcDir} doesn't look like a Standard Ebook source directory."
	fi

	if [ "${verbose}" = "true" ]; then
		printf "Preparing %s ...\n" "${srcDir}"
	fi

	#Refresh the statistics stored in content.opf: first the word count, then the Flesch reading ease.
	#Both are skipped together when word counting is switched off.
	if [ "${doWordCount}" = "true" ]; then
		if [ "${verbose}" = "true" ]; then printf "\tUpdating word count ..."; fi

		wordCount=$("${wordcountPath}" -x "${srcDir}")
		sed -i -E "s|<meta property=\"se:word-count\">[^<]*</meta>|<meta property=\"se:word-count\">${wordCount}</meta>|g" "${srcDir}/src/epub/content.opf" > /dev/null 2>&1

		if [ "${verbose}" = "true" ]; then printf " Done.\n"; fi


		if [ "${verbose}" = "true" ]; then printf "\tUpdating reading ease ..."; fi

		ease=$("${readingEasePath}" "${srcDir}")
		sed -i -E "s|<meta property=\"se:reading-ease.flesch\">[^<]*</meta>|<meta property=\"se:reading-ease.flesch\">${ease}</meta>|g" "${srcDir}/src/epub/content.opf" > /dev/null 2>&1

		if [ "${verbose}" = "true" ]; then printf " Done.\n"; fi
	fi

	#Increment the revision number in content.opf and keep the edition wording of the colophon in step with it
	if [ "${doRevision}" = "true" ]; then
		if [ "${verbose}" = "true" ]; then printf "\tUpdating revision number ..."; fi

		#Read the current revision; when nothing is found the empty value counts as 0 in the arithmetic below
		revision=$(grep -E "<meta property=\"se:revision-number\">[0-9]+</meta>" "${srcDir}/src/epub/content.opf" | grep -o -E "[0-9]+")
		revision=$((revision + 1))

		sed -i -E "s|<meta property=\"se:revision-number\">[^<]+</meta>|<meta property=\"se:revision-number\">${revision}</meta>|g" "${srcDir}/src/epub/content.opf" > /dev/null 2>&1

		#The colophon is only touched from the second revision onwards
		if [ "${revision}" != "1" ]; then
			ordinal=$("${ordinalPath}" "${revision}")

			if grep --quiet -E "<p>This is the first edition of this ebook.<br/>" "${srcDir}/src/epub/text/colophon.xhtml"; then
				#Moving from the first edition to the nth edition: relabel the existing release line, then insert the lines for the new edition
				sed -i -E "s|This edition was released on<br/>|The first edition was released on<br/>|" "${srcDir}/src/epub/text/colophon.xhtml"
				sed -i -E "s|<p>This is the first edition of this ebook.<br/>|<p>This is the <span class=\"revision-number\">${ordinal}</span> edition of this ebook.<br/>\n\t\t\tThis edition was released on<br/>\n\t\t\t<span class=\"revision-date\">${friendlyTimestamp}</span><br/>|" "${srcDir}/src/epub/text/colophon.xhtml"
			else
				#Already past the first edition: only the ordinal needs replacing
				sed -i -E "s|<span class=\"revision-number\">[^<]+</span>|<span class=\"revision-number\">${ordinal}</span>|" "${srcDir}/src/epub/text/colophon.xhtml"
			fi
		fi


		if [ "${verbose}" = "true" ]; then printf " Done.\n"; fi
	fi

	#Stamp the dates in content.opf and the colophon.
	#The release date is only written while content.opf still contains the 1900-01-01 placeholder; the modified date is refreshed every time.
	if [ "${doDates}" = "true" ]; then
		if grep --quiet "1900-01-01T00:00:00Z" "${srcDir}/src/epub/content.opf"; then
			if [ "${verbose}" = "true" ]; then printf "\tSetting release date ..."; fi

			sed -i -E "s|<dc:date>[^<]*</dc:date>|<dc:date>${isoTimestamp}</dc:date>|g" "${srcDir}/src/epub/content.opf" > /dev/null 2>&1
			sed -i -E "s|<span class=\"release-date\">.+?</span>|<span class=\"release-date\">${friendlyTimestamp}</span>|g" "${srcDir}/src/epub/text/colophon.xhtml" > /dev/null 2>&1

			if [ "${verbose}" = "true" ]; then printf " Done.\n"; fi
		fi

		if [ "${verbose}" = "true" ]; then printf "\tSetting modified date ..."; fi

		sed -i -E "s|<meta property=\"dcterms:modified\">[^<]*</meta>|<meta property=\"dcterms:modified\">${isoTimestamp}</meta>|g" "${srcDir}/src/epub/content.opf" > /dev/null 2>&1
		sed -i -E "s|<span class=\"revision-date\">.+?</span>|<span class=\"revision-date\">${friendlyTimestamp}</span>|g" "${srcDir}/src/epub/text/colophon.xhtml" > /dev/null 2>&1

		if [ "${verbose}" = "true" ]; then printf " Done.\n"; fi
	fi

	#See if we have necessary MARC relator in metadata
	#Certain front and back matter files call for a matching contributor role in content.opf.
	#find's output is read NUL-delimited, so a path is never split on whitespace or glob-expanded;
	#a file such as "old introduction.xhtml" is therefore not mistaken for introduction.xhtml.
	while IFS= read -r -d '' file
	do
		filename=$(basename "${file}")
		relatorPattern=""
		relatorWarning=""

		#Map the file to the relator(s) it requires and to the warning printed when none is present
		case "${filename}" in
			introduction.xhtml)
				relatorPattern=">(aui|win)<"
				relatorWarning="Warning: introduction.xhtml found, but no MARC relator 'aui' (Author of introduction, but not the chief author) or 'win' (Writer of introduction) "
			;;
			preface.xhtml)
				relatorPattern=">wpr<"
				relatorWarning="Warning: preface.xhtml found, but no MARC relator 'wpr' (Writer of preface)"
			;;
			afterword.xhtml)
				relatorPattern=">aft<"
				relatorWarning="Warning: afterword.xhtml found, but no MARC relator 'aft' (Author of colophon, afterword, etc.)"
			;;
			endnotes.xhtml)
				relatorPattern=">ann<"
				relatorWarning="Warning: endnotes.xhtml found, but no MARC relator 'ann' (Annotator)"
			;;
			loi.xhtml)
				relatorPattern=">ill<"
				relatorWarning="Warning: loi.xhtml found, but no MARC relator 'ill' (Illustrator)"
			;;
		esac

		#Files without a required relator leave the pattern empty and are skipped
		if [ -n "${relatorPattern}" ] && ! grep --quiet -i -E "${relatorPattern}" "${srcDir}/src/epub/content.opf"; then
			if [ "${verbose}" = "true" ]; then
				printf "\t"
			fi
			printf "%s\n" "${relatorWarning}" 1>&2
		fi
	done < <(find "${srcDir}" -name "*.xhtml" -print0)

	#Check the language defined in content.opf against the body text
	language="$(xmlstarlet sel -N dc="http://purl.org/dc/elements/1.1/" -t -m "//dc:language" -v "." -n "${srcDir}/src/epub/content.opf")"

	#find's output is read NUL-delimited so that paths containing whitespace reach grep intact;
	#a split path cannot be opened, which would report an empty language for every file.
	while IFS= read -r -d '' file
	do
		filename=$(basename "${file}")

		#These files are exempt from the comparison
		case "${filename}" in
			colophon.xhtml|titlepage.xhtml|imprint.xhtml|halftitle.xhtml|uncopyright.xhtml|loi.xhtml|toc.xhtml)
				continue
			;;
		esac

		#Value of the xml:lang attribute on the <html> element
		fileLanguage="$(grep --only-matching -E "<html[^<]+xml:lang=\"[^\"]+\"" "${file}" | sed -E "s/.+xml:lang=\"([^\"]+)\"/\1/")"
		if [ "${fileLanguage}" != "${language}" ]; then
			if [ "${verbose}" = "true" ]; then
				printf "\t"
			fi
			#Values are passed as printf data so that a % in a filename cannot act as a format directive
			printf "Warning: %s language is %s, but content.opf language is %s!\n" "${filename}" "${fileLanguage}" "${language}" 1>&2
		fi
	done < <(find "${srcDir}" -name "*.xhtml" -print0)

	#Check if there are non-typogrified quotes or em-dashes in metadata descriptions
	#--null-data stops grep from ending a record at each newline, so a description may span several lines.
	#The double hyphen is written without backslashes: it needs no escaping in an extended regular
	#expression, and newer GNU grep releases print a warning for such stray escapes.
	if grep --quiet --null-data --extended-regexp "<meta id=\"long-description\" property=\"se:long-description\" refines=\"#description\">[^<]+?(['\"]|--)[^<]+?</meta>" "${srcDir}/src/epub/content.opf"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: non-typogrified \", ', or -- detected in metadata long description!\n" 1>&2
	fi

	if grep --quiet --null-data --extended-regexp "<dc:description id=\"description\">[^<]+?(['\"]|--)[^<]+?</dc:description>" "${srcDir}/src/epub/content.opf"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: non-typogrified \", ', or -- detected in metadata description!\n" 1>&2
	fi

	#See if we have non-https source links
	#Each entry is printed in the warning exactly as listed; the search pattern is the entry with a single trailing slash.
	for insecureLink in "http://www.gutenberg.org" "http://catalog.hathitrust.org" "http://archive.org/"
	do
		if grep --quiet -i -R "${insecureLink%/}/" "${srcDir}/src/"; then
			if [ "${verbose}" = "true" ]; then
				printf "\t"
			fi
			printf "Warning: Non-https link detected: %s\n" "${insecureLink}" 1>&2
		fi
	done

	#Library of Congress name authority links that carry a .html extension
	if grep --quiet -E -i -R "id.loc.gov/authorities/names/[^\.]+.html" "${srcDir}/src/"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: LoC identifier with .html extension detected!\n" 1>&2
	fi

	#Look for the Unicode replacement character anywhere in the source
	if grep --quiet -i -R "�" "${srcDir}/src/"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: UTF replacement character detected!  Transcription encoding error?\n" 1>&2
	fi

	#Subtitles in the text need the matching selector in the CSS.
	#Both searches use fixed strings (-F). As a basic regular expression the selector's "\|" is
	#an alternation in GNU grep, which made any "span[epub" selector count as subtitle CSS.
	if grep --quiet -i -R -F "epub:type=\"subtitle\"" "${srcDir}/src/epub/text/"; then
		if ! grep --quiet -i -R -F "span[epub|type~=\"subtitle\"]" "${srcDir}/src/epub/css/"; then
			if [ "${verbose}" = "true" ]; then
				printf "\t"
			fi
			printf "Warning: Subtitles detected, but no subtitle CSS detected!\n" 1>&2
		fi
	fi

	#Self-closed, and therefore empty, paragraphs
	if grep --quiet -i -R "<p/>" "${srcDir}/src/"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: Empty <p> tag (<p/>) detected\n" 1>&2
	fi

	#Run some external checks
	#The unused selector tool writes its own output; its exit status is not inspected.
	"${unusedSelectorPath}" "${srcDir}"

	#Check for HTML entities in long-description
	#An entity shows up in the escaped description as "&amp;" followed by the entity name and a semicolon.
	if grep --quiet -E -i "&amp;[a-z]+?;" "${srcDir}/src/epub/content.opf"; then
		if [ "${verbose}" = "true" ]; then
			printf "\t"
		fi
		printf "Warning: HTML entites detected in metadata!  Use UTF characters where possible.\n" 1>&2
	fi

	#Collect the distinct class names used on <abbr> tags in the text, one per line, and make sure
	#that each class from the list below has an abbr.CLASS rule in local.css
	grep --extended-regexp --no-filename --only-matching "<abbr[^<]+>" "${srcDir}/src/epub/text/"* \
		| sed --regexp-extended -e 's/<abbr class=\"//g' -e 's/<abbr xml:lang="//g' -e 's/">//g' -e "s/\s+/\n/g" \
		| sort \
		| uniq \
		| while read abbrClass
	do
		case "${abbrClass}" in
			name|era|degree|initialism|acronym|temperature|postal|timezone)
				if ! grep --quiet "abbr\.${abbrClass}" "${srcDir}/src/epub/css/local.css"; then
					if [ "${verbose}" = "true" ]; then
						printf "\t"
					fi
					printf "Warning: There seem to be an <abbr class=\"${abbrClass}\"> tag, but no abbr.${abbrClass} styling!\n" 1>&2
				fi
			;;
		esac
	done
done