Merge pull request #237 from gutenbergtools/v12

0.12.45
gutenbergtools · Sep 20, 2024 · b13c351 · b13c351
2 parents c2e4704 + 7cdf520
commit b13c351
Show file tree

Hide file tree

Showing 13 changed files with 129 additions and 32 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,20 @@
+0.12.45 September 18, 2024
+- generated covers are now 1600x2400 to comply with Apple Books recommended minimum width and DP guidelines https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ#Information_for_all_types_of_cover #234
+- added accessibility metadata to EPUB3 content.ocf as suggested by ACE
+- stub implementation to allow assertions of good alt text in config.
+- added aria labels and roles to nav elements of EPUB3 content.ocf and toc.xhtml
+- added lang attribute to wrapper file html elements as suggested by ACE
+- fix opengraph urls in HTML metadata #235
+- update cchardet to solve problems installing on python 3.11
+- alt-text logging is restructured
+    - empty alt-text warnings are now suppressed in figures
+    - empty alt-text warnings are now suppressed when role='presentation' or aria-labelledby attributes are present
+    - the alt text examination is moved from the Spider module to the HTMLParser module.
+    - ids are assigned to all img elements to facilitate alt-text mitigation. 
+    - alt-text logging is improved.
+    - empty alt-text warnings now reference a newly added doc page: https://github.com/gutenbergtools/ebookmaker/blob/master/docs/alt-text.md
+- bug in undeployed 0.12.44 fixed
+
 0.12.43 May 22, 2024
 - fixed chunker bugs:
     - no longer emits empty chunks (was happening with large child elements of body) #224

diff --git a/Pipfile b/Pipfile
@@ -8,8 +8,9 @@ pylint = "*"
 
 [packages]
 e1839a8 = {path = ".",editable = true}
-libgutenberg = "==0.10.25"
+libgutenberg = "==0.10.26"
 psycopg2 = "*"
 docutils = ">=0.18.1"
 html5lib = "*"
+cchardet = "==2.2.0a2"
 ebookmaker = {file = ".", editable = true}
diff --git a/docs/alt-text.md b/docs/alt-text.md
@@ -0,0 +1,19 @@
+Ebookmaker encourages proper use of the alt attribute to make books with images more accessible to the reading disabled. Ebookmaker ensures that every `img` element has an `alt` attribute and issues warnings if the alt attribute is empty.
+
+Often the `alt` attribute should be left empty:
+
+1. when the image is purely decorative or used to help with the visual presentation of text. It would be disruptive to a person using text-to-speech or a braille reader to have the image described. In such a case, add a `role` attribute with value `presentation`: `<img src="image.png" alt="" role="presentation">` and the warning message will be suppressed.
+
+2. when the image is well described by associated text. Often an image from a book will appear above a descriptive caption. For this reason, Ebookmaker will not emit a warning message if the image appears inside a `<figure>` element containing a `<figcaption>`, or if the img has an `aria-labelledby` attribute: `<img src="image.png" alt="" aria-labelledby="id_for_label">`. But when relying on caption text, make sure it describes what a sighted reader sees. Some captions comment on the image without describing it.
+
+
+Accessibility Tutorial:
+https://www.w3.org/WAI/tutorials/images/
+
+Using `aria-labelledby`:
+https://www.w3.org/WAI/WCAG21/Techniques/aria/ARIA16
+
+Other helpful guides:
+https://publishers.asn.au/BooksWithoutBarriers
+https://axesslab.com/alt-texts/
+https://accessibility.huit.harvard.edu/describe-content-images
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = ebookmaker
-version = 0.12.43
+version = 0.12.45
 
 [options]
 package_dir=

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 from setuptools import setup
 
-VERSION = '0.12.43'
+VERSION = '0.12.45'
 
 if __name__ == "__main__":
 
@@ -46,7 +46,7 @@
             'requests',
             'six>=1.4.1',
             'libgutenberg[covers]>=0.10.22',
-            'cchardet',
+            'cchardet==2.2.0a2',
             'beautifulsoup4',
             'html5lib',
         ],
@@ -82,9 +82,9 @@
             "Operating System :: OS Independent",
             "Intended Audience :: Other Audience",
             "Programming Language :: Python",
-            "Programming Language :: Python :: 3.7",
-            "Programming Language :: Python :: 3.8",
             "Programming Language :: Python :: 3.9",
+            "Programming Language :: Python :: 3.10",
+            "Programming Language :: Python :: 3.11",
         ],
 
         platforms = 'OS-independent'

diff --git a/src/ebookmaker/EbookMaker.py b/src/ebookmaker/EbookMaker.py
@@ -196,7 +196,7 @@ def elect_coverpage(spider, url, dc):
 
 def generate_cover(dir, dc):
     try:
-        cover_image = Cover.draw(dc, cover_width=1200, cover_height=1800)
+        cover_image = Cover.draw(dc, cover_width=1600, cover_height=2400)
         cover_url = os.path.join(dir, make_output_filename('cover', dc))
         with open(cover_url, 'wb+') as cover:
             cover_image.save(cover)

diff --git a/src/ebookmaker/Spider.py b/src/ebookmaker/Spider.py
@@ -27,7 +27,6 @@
 from ebookmaker.CommonCode import Options
 from ebookmaker.ParserFactory import ParserFactory
 
-NO_ALT_TEXT = 'Empty alt text for %s. See https://www.w3.org/WAI/tutorials/images/ for info on accessible alt text.'
 
 options = Options()
 
@@ -160,8 +159,6 @@ def recursive_parse(self, root_attribs):
                         self.enqueue(queue, depth + 1, new_attribs, True)
 
                 elif tag in (NS.xhtml.img, NS.xhtml.style):
-                    if 'alt' in elem.attrib and elem.attrib['alt'] == '':
-                        warning(NO_ALT_TEXT, url)
                     if tag == NS.xhtml.style or self.is_image(new_attribs):
                         self.enqueue(queue, depth, new_attribs, False)
                     else:

diff --git a/src/ebookmaker/Version.py b/src/ebookmaker/Version.py
@@ -1,2 +1,2 @@
-VERSION = '0.12.43'
+VERSION = '0.12.45'
 GENERATOR = 'Ebookmaker %s by Project Gutenberg'
diff --git a/src/ebookmaker/parsers/HTMLParser.py b/src/ebookmaker/parsers/HTMLParser.py
@@ -10,6 +10,7 @@
 Distributable under the GNU General Public License Version 3 or newer.
 
 """
+
 import re
 import unicodedata
 
@@ -148,6 +149,8 @@
 
 RE_NOT_XML_NAMECHAR = re.compile(r'[^\w.-]')
 
+NO_ALT_TEXT = 'Empty alt text for %s. See https://github.com/gutenbergtools/ebookmaker/blob/master/docs/alt-text.md'
+
 def nfc(_str):
     return unicodedata.normalize('NFC', EntitySubstitution.substitute_xml(_str))
 
@@ -224,6 +227,8 @@ def ids_and_names(xhtml):
                 yield node
             for node in xpath(xhtml, "//xhtml:a[@name]"):
                 yield node
+            for node in xpath(xhtml, "//xhtml:img[not(@id)]"):
+                yield node
 
         # move anchor name to id
         # 'id' values are more strict than 'name' values
@@ -241,6 +246,9 @@ def ids_and_names(xhtml):
                 del anchor.attrib['id']
             if NS.xml.id in anchor.attrib:
                 del anchor.attrib[NS.xml.id]
+            if not id_:
+                # we want every img to have an id_
+                id_ = f"img_{anchor.get('src')}"
 
             id_ = self._fix_id(id_)
 
@@ -250,8 +258,15 @@ def ids_and_names(xhtml):
 
             # well-formed id
             if id_ in self.seen_ids:
-                error("Dropping duplicate id '%s' in %s" % (id_, self.attribs.url))
-                continue
+                if anchor.tag == NS.xhtml.img:
+                    # more than one img referencing an image file
+                    n = 1
+                    while f'{id_}_{n}' in self.seen_ids:
+                        n += 1
+                    id_ = f'{id_}_{n}'
+                else:
+                    error("dropping duplicate id '%s' in %s" % (id_, self.attribs.url))
+                    continue
 
             self.seen_ids.add(id_)
             anchor.set('id', id_)
@@ -457,6 +472,29 @@ def captionid():
                     figure.attrib['role'] = 'figure'
                     figure.attrib['aria-labelledby'] = caption.attrib['id']
                     break
+
+        # process alt tags
+        for elem in xpath(self.xhtml, "//xhtml:img"):
+            infigure = False
+            labeled = elem.get('aria-labelledby')
+            if labeled and labeled in self.seen_ids:
+                continue
+            if elem.get('role') == 'presentation':
+                continue
+            alt = elem.get('alt').split('\r\n')[0]
+            if not alt:
+                # check if it's in a figure
+                parent = elem.getparent()              
+                while parent is not None:
+                    if parent.tag == NS.xhtml.figure:
+                        infigure = True
+                        break
+                    parent = parent.getparent()
+                if not infigure:
+                    warning(NO_ALT_TEXT, elem.get('src'))
+            id_ = elem.get('id')
+            info(f'[ALTTEXT]{self.attribs.url},{id_},{alt},{elem.get("src")},{infigure}')
+
 
         ##### cleanup #######
 

diff --git a/src/ebookmaker/parsers/__init__.py b/src/ebookmaker/parsers/__init__.py
@@ -81,7 +81,7 @@
 
 STYLE_LINK = '<link href="pgepub.css" rel="stylesheet"/>'
 IMAGE_WRAPPER = """<?xml version="1.0"?>{doctype}
-<html xmlns="http://www.w3.org/1999/xhtml">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
   <head>
     <title>{title}</title>
     {style}
@@ -128,8 +128,8 @@ class ParserAttributes(object): # pylint: disable=too-few-public-methods
     Typical attributes held here would be:
       - url
       - orig_url
-      - mediatpye
-      - orig_mediatpye
+      - mediatype
+      - orig_mediatype
       - referrer
       - id
 

diff --git a/src/ebookmaker/writers/Epub3Writer.py b/src/ebookmaker/writers/Epub3Writer.py
@@ -112,6 +112,12 @@
 }
 """
 
+def alt_text_good(book_id):
+    # stub implementation which allows listing books with good alt text in config file
+    return str(book_id) in options.good_alt_text.split() if hasattr(
+        options, 'good_alt_text') else False
+
+
 class OEBPSContainer(EpubWriter.OEBPSContainer):
     """ Class representing an OEBPS Container. """
 
@@ -123,7 +129,7 @@ def add_cover_wrapper(self, parser):
         (cover_x, cover_y) = parser.get_image_dimen()
         wrapper = f'''
 <!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
   <head>
     <title>"Cover"</title>
     <link href="pgepub.css" rel="stylesheet"/>
@@ -213,7 +219,7 @@ def _make_navmap(self, toc):
         """ Build the toc. """
         em = self.elementmaker
 
-        root = em.nav(**{EPUB_TYPE: 'toc'})
+        root = em.nav(**{EPUB_TYPE: 'toc', 'role': 'doc-toc', 'aria-label': 'Table of Contents'})
         toctop = em.ol()
         root.append(toctop)
 
@@ -250,7 +256,7 @@ def _make_navmap(self, toc):
     def _make_pagelist(self, toc):
         """ Build the page list. """
         em = self.elementmaker
-        root = em.nav(**{EPUB_TYPE: 'landmarks'})
+        root = em.nav(**{EPUB_TYPE: 'landmarks', 'aria-label': 'Page List'})
         pagelist_top = em.ol(**{'id': 'pages', 'class': 'pagelist'})
         root.append(pagelist_top)
 
@@ -271,6 +277,7 @@ class ContentOPF(object):
 
     def __init__(self):
         self.nsmap = gg.build_nsmap('opf dc dcterms xsi')
+        self.lang = None
 
         # FIXME: remove this when lxml is fixed
         # workaround for lxml fat-fingering the default attribute namespaces
@@ -295,7 +302,7 @@ def __unicode__(self):
         assert len(self.spine) > 0, 'No spine item in content.opf.'
 
         package = self.opf.package(
-            **{'version': '3.0', 'unique-identifier': 'id'}) # FIXME add version to instance
+            **{'version': '3.0', 'unique-identifier': 'id', NS.xml.lang: self.lang})
         package.append(self.metadata)
         package.append(self.manifest)
         package.append(self.spine)
@@ -465,7 +472,9 @@ def metadata_item(self, dc):
 
         for language in dc.languages:
             self.metadata.append(dcterms.language(language.id))
-
+            if not self.lang:
+                self.lang = language.id  # assume first lang is main lang
+
         for subject in dc.subjects:
             self.metadata.append(dcterms.subject(subject.subject))
 
@@ -484,6 +493,23 @@ def metadata_item(self, dc):
                 source = urllib.parse.urljoin(options.config.PGURL, source)
 
         self.metadata.append(dcterms.source(source))
+
+        # accessibility Metadata
+        self.metadata.append(self.opf.meta('textual', {'property': 'schema:accessMode'}))
+        self.metadata.append(self.opf.meta('readingOrder', {
+            'property': 'schema:accessibilityFeature'}))
+        self.metadata.append(self.opf.meta('none', {'property': 'schema:accessibilityHazard'}))
+        if alt_text_good(dc.project_gutenberg_id):
+            self.metadata.append(self.opf.meta('alternativeText', {
+                'property': 'schema:accessibilityFeature'}))
+            a11y_summary = 'This publication has complete alternative text descriptions.'
+        else:
+            a11y_summary = 'This publication may not have complete alternative text descriptions.'
+        # TODO: reimplement these indicators when audio is included
+        self.metadata.append(self.opf.meta('textual,visual', {
+            'property': 'schema:accessModeSufficient'}))
+        self.metadata.append(self.opf.meta(a11y_summary, {
+            'property': 'schema:accessibilitySummary'}))
 
 
     def add_coverpage(self, url, id_):

diff --git a/src/ebookmaker/writers/HTMLWriter.py b/src/ebookmaker/writers/HTMLWriter.py
@@ -23,8 +23,10 @@
 
 
 from libgutenberg.Logger import debug, exception, info, error, warning
+from libgutenberg.GutenbergGlobals import PG_URL
 
 from ebookmaker import writers
+from ebookmaker.EbookMaker import FILENAMES
 from ebookmaker.CommonCode import Options
 from ebookmaker.parsers import webify_url, CSSParser
 from ebookmaker.parsers.CSSParser import cssutils
@@ -135,6 +137,12 @@ def serialize(xhtml):
     return htmlbytes
 
 
+def canonical_url(dc, type_):
+    textnum = dc.project_gutenberg_id or '00000'
+    filename = FILENAMES.get(type_, 'pg{id}.' + type_).format(id=textnum)
+    return f'{PG_URL}cache/epub/{textnum}/{filename}'
+
+
 class Writer(writers.HTMLishWriter):
     """ Class for writing HTML files. """
     VALIDATOR = 'HTML_VALIDATOR'
@@ -159,11 +167,8 @@ def add_moremeta(self, job, tree, url):
         for dcmitype in job.dc.dcmitypes:
             self.add_prop(tree, "og:type", dcmitype.id)
         info(job.main)
-        web_url = urljoin(job.dc.canonical_url, job.outputfile)
-        self.add_prop(tree, "og:url", web_url)
-        canonical_cover_name = 'pg%s.cover.medium.jpg' % job.dc.project_gutenberg_id
-        cover_url = urljoin(job.dc.canonical_url, canonical_cover_name)
-        self.add_prop(tree, "og:image", cover_url)
+        self.add_prop(tree, "og:url", canonical_url(job.dc, job.type))
+        self.add_prop(tree, "og:image", canonical_url(job.dc, 'cover.medium'))
 
         # fix empty title elements
         for title in xpath(tree, '//xhtml:title[not(text())]'):
@@ -338,12 +343,6 @@ def xhtml_to_html(html):
             for elem in xpath(html, f"//xhtml:{tag}[@{attr}]"):
                 del elem.attrib[attr]
 
-        # set required attributes
-        attrs_to_fill = [('img', 'alt', '')]
-        for (tag, attr, fill) in attrs_to_fill:
-            for elem in xpath(html, f"//xhtml:{tag}[not(@{attr})]"):
-                elem.set(attr, fill)
-
         # remove not_empty attributes
         nullattrs_to_remove = ['height', 'width']
         for attr in nullattrs_to_remove:

diff --git a/tests/out/69030-cover.png b/tests/out/69030-cover.png