Skip to content

Commit

Permalink
Merge pull request #237 from gutenbergtools/v12
Browse files Browse the repository at this point in the history
0.12.45
  • Loading branch information
eshellman authored Sep 20, 2024
2 parents c2e4704 + 7cdf520 commit b13c351
Show file tree
Hide file tree
Showing 13 changed files with 129 additions and 32 deletions.
17 changes: 17 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
0.12.45 September 18, 2024
- generated covers are now 1600x2400 to comply with Apple Books recommended minimum width and DP guidelines https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ#Information_for_all_types_of_cover #234
- added accessibility metadata to EPUB3 content.opf as suggested by ACE
- stub implementation to allow assertions of good alt text in config.
- added aria labels and roles to nav elements of EPUB3 content.opf and toc.xhtml
- added lang attribute to wrapper file html elements as suggested by ACE
- fix opengraph urls in HTML metadata #235
- update cchardet to solve problems installing on python 3.11
- alt-text logging is restructured
- empty alt-text warnings are now suppressed in figures
- empty alt-text warnings are now suppressed when role='presentation' or aria-labelledby attributes are present
- the alt text examination is moved from the Spider module to the HTMLParser module.
- ids are assigned to all img elements to facilitate alt-text mitigation.
- alt-text logging is improved.
- empty alt-text warnings now reference a newly added doc page: https://github.com/gutenbergtools/ebookmaker/blob/master/docs/alt-text.md
- bug in undeployed 0.12.44 fixed

0.12.43 May 22, 2024
- fixed chunker bugs:
- no longer emits empty chunks (was happening with large child elements of body) #224
Expand Down
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ pylint = "*"

[packages]
e1839a8 = {path = ".",editable = true}
libgutenberg = "==0.10.25"
libgutenberg = "==0.10.26"
psycopg2 = "*"
docutils = ">=0.18.1"
html5lib = "*"
cchardet = "==2.2.0a2"
ebookmaker = {file = ".", editable = true}
19 changes: 19 additions & 0 deletions docs/alt-text.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Ebookmaker encourages proper use of the alt attribute to make books with images more accessible to the reading disabled. Ebookmaker ensures that every `img` element has an `alt` attribute and issues warnings if the alt attribute is empty.

Often the `alt` attribute should be left empty:

1. when the image is purely decorative or used to help with the visual presentation of text. It would be disruptive to a person using text-to-speech or a braille reader to have the image described. In such a case, add a `role` attribute with value `presentation`: `<img src="image.png" alt="" role="presentation">` and the warning message will be suppressed.

2. when the image is well described by associated text. Often an image from a book will appear above a descriptive caption. For this reason, Ebookmaker will not emit a warning message if the image appears inside a `<figure>` element containing a `<figcaption>`, or if the img has an `aria-labelledby` attribute: `<img src="image.png" alt="" aria-labelledby="id_for_label">`. But when relying on caption text, make sure it describes what a sighted reader sees. Some captions comment on the image without describing it.


Accessibility Tutorial:
https://www.w3.org/WAI/tutorials/images/

Using `aria-labelledby`:
https://www.w3.org/WAI/WCAG21/Techniques/aria/ARIA16

Other helpful guides:
https://publishers.asn.au/BooksWithoutBarriers
https://axesslab.com/alt-texts/
https://accessibility.huit.harvard.edu/describe-content-images
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = ebookmaker
version = 0.12.43
version = 0.12.45

[options]
package_dir=
Expand Down
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from setuptools import setup

VERSION = '0.12.43'
VERSION = '0.12.45'

if __name__ == "__main__":

Expand Down Expand Up @@ -46,7 +46,7 @@
'requests',
'six>=1.4.1',
'libgutenberg[covers]>=0.10.22',
'cchardet',
'cchardet==2.2.0a2',
'beautifulsoup4',
'html5lib',
],
Expand Down Expand Up @@ -82,9 +82,9 @@
"Operating System :: OS Independent",
"Intended Audience :: Other Audience",
"Programming Language :: Python",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],

platforms = 'OS-independent'
Expand Down
2 changes: 1 addition & 1 deletion src/ebookmaker/EbookMaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def elect_coverpage(spider, url, dc):

def generate_cover(dir, dc):
try:
cover_image = Cover.draw(dc, cover_width=1200, cover_height=1800)
cover_image = Cover.draw(dc, cover_width=1600, cover_height=2400)
cover_url = os.path.join(dir, make_output_filename('cover', dc))
with open(cover_url, 'wb+') as cover:
cover_image.save(cover)
Expand Down
3 changes: 0 additions & 3 deletions src/ebookmaker/Spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from ebookmaker.CommonCode import Options
from ebookmaker.ParserFactory import ParserFactory

NO_ALT_TEXT = 'Empty alt text for %s. See https://www.w3.org/WAI/tutorials/images/ for info on accessible alt text.'

options = Options()

Expand Down Expand Up @@ -160,8 +159,6 @@ def recursive_parse(self, root_attribs):
self.enqueue(queue, depth + 1, new_attribs, True)

elif tag in (NS.xhtml.img, NS.xhtml.style):
if 'alt' in elem.attrib and elem.attrib['alt'] == '':
warning(NO_ALT_TEXT, url)
if tag == NS.xhtml.style or self.is_image(new_attribs):
self.enqueue(queue, depth, new_attribs, False)
else:
Expand Down
2 changes: 1 addition & 1 deletion src/ebookmaker/Version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION = '0.12.43'
VERSION = '0.12.45'
GENERATOR = 'Ebookmaker %s by Project Gutenberg'
42 changes: 40 additions & 2 deletions src/ebookmaker/parsers/HTMLParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Distributable under the GNU General Public License Version 3 or newer.
"""

import re
import unicodedata

Expand Down Expand Up @@ -148,6 +149,8 @@

RE_NOT_XML_NAMECHAR = re.compile(r'[^\w.-]')

NO_ALT_TEXT = 'Empty alt text for %s. See https://github.com/gutenbergtools/ebookmaker/blob/master/docs/alt-text.md'

def nfc(_str):
return unicodedata.normalize('NFC', EntitySubstitution.substitute_xml(_str))

Expand Down Expand Up @@ -224,6 +227,8 @@ def ids_and_names(xhtml):
yield node
for node in xpath(xhtml, "//xhtml:a[@name]"):
yield node
for node in xpath(xhtml, "//xhtml:img[not(@id)]"):
yield node

# move anchor name to id
# 'id' values are more strict than 'name' values
Expand All @@ -241,6 +246,9 @@ def ids_and_names(xhtml):
del anchor.attrib['id']
if NS.xml.id in anchor.attrib:
del anchor.attrib[NS.xml.id]
if not id_:
# we want every img to have an id_
id_ = f"img_{anchor.get('src')}"

id_ = self._fix_id(id_)

Expand All @@ -250,8 +258,15 @@ def ids_and_names(xhtml):

# well-formed id
if id_ in self.seen_ids:
error("Dropping duplicate id '%s' in %s" % (id_, self.attribs.url))
continue
if anchor.tag == NS.xhtml.img:
# more than one img referencing an image file
n = 1
while f'{id_}_{n}' in self.seen_ids:
n += 1
id_ = f'{id_}_{n}'
else:
error("dropping duplicate id '%s' in %s" % (id_, self.attribs.url))
continue

self.seen_ids.add(id_)
anchor.set('id', id_)
Expand Down Expand Up @@ -457,6 +472,29 @@ def captionid():
figure.attrib['role'] = 'figure'
figure.attrib['aria-labelledby'] = caption.attrib['id']
break

# process alt tags
for elem in xpath(self.xhtml, "//xhtml:img"):
infigure = False
labeled = elem.get('aria-labelledby')
if labeled and labeled in self.seen_ids:
continue
if elem.get('role') == 'presentation':
continue
alt = elem.get('alt').split('\r\n')[0]
if not alt:
# check if it's in a figure
parent = elem.getparent()
while parent is not None:
if parent.tag == NS.xhtml.figure:
infigure = True
break
parent = parent.getparent()
if not infigure:
warning(NO_ALT_TEXT, elem.get('src'))
id_ = elem.get('id')
info(f'[ALTTEXT]{self.attribs.url},{id_},{alt},{elem.get("src")},{infigure}')


##### cleanup #######

Expand Down
6 changes: 3 additions & 3 deletions src/ebookmaker/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@

STYLE_LINK = '<link href="pgepub.css" rel="stylesheet"/>'
IMAGE_WRAPPER = """<?xml version="1.0"?>{doctype}
<html xmlns="http://www.w3.org/1999/xhtml">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<title>{title}</title>
{style}
Expand Down Expand Up @@ -128,8 +128,8 @@ class ParserAttributes(object): # pylint: disable=too-few-public-methods
Typical attributes held here would be:
- url
- orig_url
- mediatpye
- orig_mediatpye
- mediatype
- orig_mediatype
- referrer
- id
Expand Down
36 changes: 31 additions & 5 deletions src/ebookmaker/writers/Epub3Writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@
}
"""

def alt_text_good(book_id):
    """Tell whether *book_id* is listed as having good alt text.

    Stub implementation: books with good alt text are listed in the
    config file via the ``good_alt_text`` option, whose value is split
    on whitespace. Returns False when the option is not set at all.
    """
    if not hasattr(options, 'good_alt_text'):
        return False
    listed_ids = options.good_alt_text.split()
    return str(book_id) in listed_ids


class OEBPSContainer(EpubWriter.OEBPSContainer):
""" Class representing an OEBPS Container. """

Expand All @@ -123,7 +129,7 @@ def add_cover_wrapper(self, parser):
(cover_x, cover_y) = parser.get_image_dimen()
wrapper = f'''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<title>"Cover"</title>
<link href="pgepub.css" rel="stylesheet"/>
Expand Down Expand Up @@ -213,7 +219,7 @@ def _make_navmap(self, toc):
""" Build the toc. """
em = self.elementmaker

root = em.nav(**{EPUB_TYPE: 'toc'})
root = em.nav(**{EPUB_TYPE: 'toc', 'role': 'doc-toc', 'aria-label': 'Table of Contents'})
toctop = em.ol()
root.append(toctop)

Expand Down Expand Up @@ -250,7 +256,7 @@ def _make_navmap(self, toc):
def _make_pagelist(self, toc):
""" Build the page list. """
em = self.elementmaker
root = em.nav(**{EPUB_TYPE: 'landmarks'})
root = em.nav(**{EPUB_TYPE: 'landmarks', 'aria-label': 'Page List'})
pagelist_top = em.ol(**{'id': 'pages', 'class': 'pagelist'})
root.append(pagelist_top)

Expand All @@ -271,6 +277,7 @@ class ContentOPF(object):

def __init__(self):
self.nsmap = gg.build_nsmap('opf dc dcterms xsi')
self.lang = None

# FIXME: remove this when lxml is fixed
# workaround for lxml fat-fingering the default attribute namespaces
Expand All @@ -295,7 +302,7 @@ def __unicode__(self):
assert len(self.spine) > 0, 'No spine item in content.opf.'

package = self.opf.package(
**{'version': '3.0', 'unique-identifier': 'id'}) # FIXME add version to instance
**{'version': '3.0', 'unique-identifier': 'id', NS.xml.lang: self.lang})
package.append(self.metadata)
package.append(self.manifest)
package.append(self.spine)
Expand Down Expand Up @@ -465,7 +472,9 @@ def metadata_item(self, dc):

for language in dc.languages:
self.metadata.append(dcterms.language(language.id))

if not self.lang:
self.lang = language.id # assume first lang is main lang

for subject in dc.subjects:
self.metadata.append(dcterms.subject(subject.subject))

Expand All @@ -484,6 +493,23 @@ def metadata_item(self, dc):
source = urllib.parse.urljoin(options.config.PGURL, source)

self.metadata.append(dcterms.source(source))

# accessibility Metadata
self.metadata.append(self.opf.meta('textual', {'property': 'schema:accessMode'}))
self.metadata.append(self.opf.meta('readingOrder', {
'property': 'schema:accessibilityFeature'}))
self.metadata.append(self.opf.meta('none', {'property': 'schema:accessibilityHazard'}))
if alt_text_good(dc.project_gutenberg_id):
self.metadata.append(self.opf.meta('alternativeText', {
'property': 'schema:accessibilityFeature'}))
a11y_summary = 'This publication has complete alternative text descriptions.'
else:
a11y_summary = 'This publication may not have complete alternative text descriptions.'
# TODO: reimplement these indicators when audio is included
self.metadata.append(self.opf.meta('textual,visual', {
'property': 'schema:accessModeSufficient'}))
self.metadata.append(self.opf.meta(a11y_summary, {
'property': 'schema:accessibilitySummary'}))


def add_coverpage(self, url, id_):
Expand Down
21 changes: 10 additions & 11 deletions src/ebookmaker/writers/HTMLWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@


from libgutenberg.Logger import debug, exception, info, error, warning
from libgutenberg.GutenbergGlobals import PG_URL

from ebookmaker import writers
from ebookmaker.EbookMaker import FILENAMES
from ebookmaker.CommonCode import Options
from ebookmaker.parsers import webify_url, CSSParser
from ebookmaker.parsers.CSSParser import cssutils
Expand Down Expand Up @@ -135,6 +137,12 @@ def serialize(xhtml):
return htmlbytes


def canonical_url(dc, type_):
    """Build the URL under ``cache/epub/`` for a book's file of *type_*.

    dc    -- object exposing ``project_gutenberg_id``; '00000' is used
             in its place when that id is falsy.
    type_ -- key looked up in FILENAMES; when absent, the filename
             pattern defaults to ``'pg{id}.' + type_``.

    Returns the URL as a string rooted at PG_URL.
    """
    book_number = dc.project_gutenberg_id or '00000'
    default_pattern = 'pg{id}.' + type_
    name_pattern = FILENAMES.get(type_, default_pattern)
    file_name = name_pattern.format(id=book_number)
    return f'{PG_URL}cache/epub/{book_number}/{file_name}'


class Writer(writers.HTMLishWriter):
""" Class for writing HTML files. """
VALIDATOR = 'HTML_VALIDATOR'
Expand All @@ -159,11 +167,8 @@ def add_moremeta(self, job, tree, url):
for dcmitype in job.dc.dcmitypes:
self.add_prop(tree, "og:type", dcmitype.id)
info(job.main)
web_url = urljoin(job.dc.canonical_url, job.outputfile)
self.add_prop(tree, "og:url", web_url)
canonical_cover_name = 'pg%s.cover.medium.jpg' % job.dc.project_gutenberg_id
cover_url = urljoin(job.dc.canonical_url, canonical_cover_name)
self.add_prop(tree, "og:image", cover_url)
self.add_prop(tree, "og:url", canonical_url(job.dc, job.type))
self.add_prop(tree, "og:image", canonical_url(job.dc, 'cover.medium'))

# fix empty title elements
for title in xpath(tree, '//xhtml:title[not(text())]'):
Expand Down Expand Up @@ -338,12 +343,6 @@ def xhtml_to_html(html):
for elem in xpath(html, f"//xhtml:{tag}[@{attr}]"):
del elem.attrib[attr]

# set required attributes
attrs_to_fill = [('img', 'alt', '')]
for (tag, attr, fill) in attrs_to_fill:
for elem in xpath(html, f"//xhtml:{tag}[not(@{attr})]"):
elem.set(attr, fill)

# remove not_empty attributes
nullattrs_to_remove = ['height', 'width']
for attr in nullattrs_to_remove:
Expand Down
Binary file modified tests/out/69030-cover.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit b13c351

Please sign in to comment.