find-unused-selectors

#!/usr/bin/env python3

import argparse
import glob
import sys
import os
import regex
import lxml.cssselect
import lxml.etree as etree

# Namespace prefixes made available to CSS selectors and to the XPath expressions compiled from them
XHTML_NAMESPACES = {
	"xhtml": "http://www.w3.org/1999/xhtml",
	"epub": "http://www.idpf.org/2007/ops",
	"z3998": "http://www.daisy.org/z3998/2012/vocab/structure/",
	"se": "http://standardebooks.org/vocab/1.0",
}

def _extract_selectors(css):
	"""
	Return the set of individual selectors found in the stylesheet text `css`.

	Comments, declaration blocks and @ defines are discarded. What is left is split on
	commas and line breaks, and each piece is stripped of surrounding whitespace.
	"""

	# Remove comments first, so that braces, commas or "@" inside a comment can't confuse the later steps
	css = regex.sub(r"/\*.*?\*/", "", css, flags=regex.DOTALL)

	# Remove actual content of css selectors. A line break is left behind so that
	# two rules written on the same line don't get glued into one selector.
	css = regex.sub(r"\{[^}]*\}", "\n", css)

	# Remove @ defines. This has to happen before the comma split, otherwise the
	# tail of something like `@media screen, print` would be mistaken for a selector.
	css = regex.sub(r"^[ \t]*@.*$", "", css, flags=regex.MULTILINE)

	# A comma separates the selectors of a group; put each one on its own line.
	# Simply deleting the comma would turn `h1, h2` into the descendant selector `h1 h2`.
	# NOTE: commas nested in functional pseudo-classes or attribute values are split too;
	# the resulting fragments don't parse and are skipped by the caller.
	css = css.replace(",", "\n")

	selectors = set()
	for line in css.splitlines():
		line = line.strip()
		if line:
			selectors.add(line)

	return selectors

def _parse_text_files(directory):
	"""
	Parse the .xhtml files in `directory`/src/epub/text/ and return a list of lxml trees.

	titlepage.xhtml, imprint.xhtml and uncopyright.xhtml are not included.
	"""

	skipped_suffixes = ("titlepage.xhtml", "imprint.xhtml", "uncopyright.xhtml")
	trees = []

	for filename in glob.glob(os.path.join(directory, "src/epub/text/") + "*.xhtml"):
		if filename.endswith(skipped_suffixes):
			continue

		# We have to remove the default namespace declaration from our document, otherwise
		# xpath won't find anything at all.  See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
		with open(filename, "r", encoding="utf-8") as file:
			xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")

		# Hand lxml bytes rather than str
		trees.append(etree.fromstring(xhtml.encode("utf-8")))

	return trees

def main():
	"""
	Command-line entry point: list the selectors of each ebook's local.css that match nothing.

	For every DIRECTORY argument, src/epub/css/local.css is read and each of its selectors
	is tested against the files in src/epub/text/. Selectors that match no element in any
	file are printed, sorted, one per line and indented with a tab, below the directory name.
	With --verbose a progress line is printed for every directory.

	Exits with status 1 if an argument is not a directory or its local.css can't be read.
	"""

	parser = argparse.ArgumentParser(description="Find unused local.css CSS selectors in Standard Ebook source directories.")
	parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
	parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
	args = parser.parse_args()

	for directory in args.directories:
		directory = os.path.abspath(directory)

		if not os.path.isdir(directory):
			print("Error: Not a directory: {}".format(directory), file=sys.stderr)
			sys.exit(1)

		if args.verbose:
			print("Processing {} ...".format(directory), end="", flush=True)

		css_filename = os.path.join(directory, "src/epub/css/local.css")

		try:
			with open(css_filename, encoding="utf-8") as file:
				css = file.read()
		except (OSError, UnicodeError):
			print("Error: Couldn't open CSS file: {}".format(css_filename), file=sys.stderr)
			sys.exit(1)

		selectors = _extract_selectors(css)

		# Parse every file once up front instead of once per selector
		trees = _parse_text_files(directory) if selectors else []

		# Now iterate over each CSS selector and see if it's used in any of the files we found
		unused_selectors = []
		for selector in sorted(selectors):
			try:
				sel = lxml.cssselect.CSSSelector(selector, translator="html", namespaces=XHTML_NAMESPACES)
			except lxml.cssselect.SelectorError:
				# ExpressionError gets thrown if we use pseudo-elements, which lxml doesn't support;
				# SelectorSyntaxError gets thrown for leftovers that aren't selectors at all.
				# Either way we can't test the selector, so we don't report it.
				continue

			if not any(tree.xpath(sel.path, namespaces=XHTML_NAMESPACES) for tree in trees):
				unused_selectors.append(selector)

		# Did we find any unused selectors?
		if unused_selectors:
			if args.verbose:
				# Terminate the "Processing ..." line
				print("")
			else:
				print(directory)

			for selector in unused_selectors:
				print("\t" + selector)
		elif args.verbose:
			print(" OK")

# Run the command-line entry point only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
	main()