feat: improve checking of OCF file name characters

- significantly refactored `OCFFilenameChecker`: - now based on ICU4J for more up-to-date Unicode support - iterate over the input string codepoints (`int`) instead of code units (`char`) for better support of charactres outside the BMP. - rely on ICU UnicodeSet to define the set of forbidden characters - improve the message reported for forbidden characters (notably for non-printable characters) - the class now implements the `Checker` interface - add unit tests for the `OCFFilenameChecker` only in a new feature file `filename-checker.feature`, located along with the EPUB 3 OCF functional tests, for easier lookup. - add a few functional tests, to make sure `OCFFilenameChecker` is correctly called when checking full publications or single package documents. - update ICU4J to v72.1 Fixes #1240, fixes #1292, fixes #1302
w3c · Nov 17, 2022 · b6ac8ea · b6ac8ea
1 parent 5c35fa0
commit b6ac8ea
Show file tree

Hide file tree

Showing 34 changed files with 515 additions and 248 deletions.
diff --git a/pom.xml b/pom.xml
@@ -173,7 +173,7 @@
         <dependency>
             <groupId>com.ibm.icu</groupId>
             <artifactId>icu4j</artifactId>
-            <version>70.1</version>
+            <version>72.1</version>
         </dependency>
         <dependency>
             <groupId>net.sf.saxon</groupId>

diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
@@ -287,8 +287,6 @@ private boolean checkContainerStructure(OCFCheckerState state)
       List<String> directories = new LinkedList<>();
 
       // Loop through the entries
-      OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
-      // FIXME catch IAE MALFORMED entries
       for (OCFResource resource : resourcesProvider)
       {
         Preconditions.checkNotNull(resource.getPath());
@@ -318,7 +316,7 @@ else if (resources.containsKey(
         else
         {
           // Check file name requirements
-          filenameChecker.checkCompatiblyEscaped(resource.getPath());
+          new OCFFilenameChecker(resource.getPath(), state.context().build()).check();;
 
           // report entry metadata
           reportFeatures(resource.getProperties());

diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java
@@ -1,127 +1,162 @@
 package com.adobe.epubcheck.ocf;
 
+import java.util.LinkedHashSet;
 import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.w3c.epubcheck.core.Checker;
 
 import com.adobe.epubcheck.api.EPUBLocation;
 import com.adobe.epubcheck.api.Report;
 import com.adobe.epubcheck.messages.MessageId;
 import com.adobe.epubcheck.opf.ValidationContext;
 import com.adobe.epubcheck.util.EPUBVersion;
-import com.google.common.collect.ImmutableSet;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UCharacterIterator;
+import com.ibm.icu.text.UForwardCharacterIterator;
+import com.ibm.icu.text.UnicodeSet;
 
-//FIXME 2022 update related PKG-* messages to contain the file name string
-public final class OCFFilenameChecker
+public final class OCFFilenameChecker implements Checker
 {
-  private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA",
-      "ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
-      "SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS");
+
+  private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze();
+
+  private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet()
+      // .add(0x002F)// SOLIDUS '/' -- allowed as path separator
+      .add(0x0022)// QUOTATION MARK '"'
+      .add(0x002A)// ASTERISK '*'
+      // .add(0x002E)// FULL STOP '.' -- only disallowed as the last character
+      .add(0x003A)// COLON ':'
+      .add(0x003C)// LESS-THAN SIGN '<'
+      .add(0x003E)// GREATER-THAN SIGN '>'
+      .add(0x003F)// QUESTION MARK '?'
+      .add(0x005C)// REVERSE SOLIDUS '\'
+      .freeze();
+
+  private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>()
+      .put("ASCII", new UnicodeSet() //
+          .addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1
+          .add(0x007C) // VERTICAL LINE '|'
+          .freeze())
+      .put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")//
+          .freeze())
+      .put("CONTROL", new UnicodeSet().add(0x007F) // DEL
+          .addAll(0x0000, 0x001F) // C0 range
+          .addAll(0x0080, 0x009F) // C1 range
+          .freeze())
+      .put("PRIVATE USE", new UnicodeSet() //
+          .addAll(0xE000, 0xF8FF) // Private Use Area
+          .addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A
+          .addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B
+          .freeze())
+      .put("SPECIALS", new UnicodeSet() //
+          .addAll(0xFFF0, 0xFFFF) // Specials Blocks
+          .freeze())
+      .put("DEPRECATED", new UnicodeSet() //
+          .add(0xE0001)// LANGUAGE TAG
+          // .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences
+          .freeze())
+      .build();
+
+  private static String toString(int codepoint, String setName)
+  {
+    assert setName != null;
+    StringBuilder result = new StringBuilder().append(String.format("U+%04X ", codepoint));
+    if ("ASCII".equals(setName))
+    {
+      result.append('(').append(UCharacter.toString(codepoint)).append(')');
+    }
+    else
+    {
+      String characterName = UCharacter.getName(codepoint);
+      if (characterName != null)
+      {
+        result.append(characterName).append(' ');
+      }
+      result.append('(').append(setName).append(')');
+    }
+    return result.toString();
+  }
 
   private final Report report;
   private final EPUBVersion version;
   private final EPUBLocation location;
+  private final String filename;
+
+  public OCFFilenameChecker(String filename, ValidationContext context)
+  {
+    this(filename, context, null);
+  }
 
-  public OCFFilenameChecker(ValidationContext context)
+  public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location)
   {
+    Preconditions.checkArgument(filename != null);
+    Preconditions.checkArgument(context != null);
+    this.filename = filename;
     this.report = context.report;
     this.version = context.version;
-    this.location = EPUBLocation.of(context);
+    this.location = (location != null) ? location : EPUBLocation.of(context);
   }
 
-  public String checkCompatiblyEscaped(final String str)
+  @Override
+  public void check()
   {
-    // don't check remote resources
-    if (str.matches("^[^:/?#]+://.*"))
-    {
-      return "";
-    }
-
-    // the test string will be used to compare test result
-    String test = checkNonAsciiFilename(str);
-
-    if (str.endsWith("."))
-    {
-      report.message(MessageId.PKG_011, location, str);
-      test += ".";
-    }
-
-    boolean spaces = false;
-    final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*',
-        '?' /* , ':','/', '\\' */ };
-    String result = "";
-    char[] chars = str.toCharArray();
-    for (char c : chars)
+    // Iterate through the code points to search disallowed characters
+    UCharacterIterator chars = UCharacterIterator.getInstance(filename);
+    final Set<String> disallowed = new LinkedHashSet<>();
+    boolean hasSpaces = false;
+    boolean isASCIIOnly = true;
+    int codepoint;
+    while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE)
     {
-      for (char a : ascciGraphic)
+      // Check if the string has non-ASCII characters
+      isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint);
+      // Check if the string has space characters
+      hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint);
+      // Check for disallowed characters
+      switch (version)
       {
-        if (c == a)
+      case VERSION_2:
+        if (DISALLOWED_EPUB2.contains(codepoint))
         {
-          result += "\"" + Character.toString(c) + "\",";
-          test += Character.toString(c);
+          disallowed.add(toString(codepoint, "ASCII"));
         }
-      }
-      if (Character.isSpaceChar(c))
-      {
-        spaces = true;
-        test += Character.toString(c);
+        break;
+      default:
+        for (String name : DISALLOWED_EPUB3.keySet())
+        {
+          if (DISALLOWED_EPUB3.get(name).contains(codepoint))
+          {
+            disallowed.add(toString(codepoint, name));
+            break;
+          }
+        }
+        break;
       }
     }
-    if (result.length() > 1)
+    // Check that FULL STOP is not used as the last character
+    if (chars.previousCodePoint() == 0x002E)
     {
-      result = result.substring(0, result.length() - 1);
-      report.message(MessageId.PKG_009, location, str, result);
+      report.message(MessageId.PKG_011, location, filename);
     }
-    if (spaces)
+    // Report if disallowed characters were found
+    if (!disallowed.isEmpty())
     {
-      report.message(MessageId.PKG_010, location, str);
+      report.message(MessageId.PKG_009, location, filename,
+          disallowed.stream().collect(Collectors.joining(", ")));
     }
-
-    if (version == EPUBVersion.VERSION_3)
+    // Report whitespace characters
+    if (hasSpaces)
     {
-      checkCompatiblyEscaped30(str, test);
+      report.message(MessageId.PKG_010, location, filename);
     }
-    return test;
-  }
-
-  private String checkNonAsciiFilename(final String str)
-  {
-    String nonAscii = str.replaceAll("[\\p{ASCII}]", "");
-    if (nonAscii.length() > 0)
+    // Report non-ASCII characters as usage
+    if (!isASCIIOnly)
     {
-      report.message(MessageId.PKG_012, location, str, nonAscii);
+      report.message(MessageId.PKG_012, location, filename);
     }
-    return nonAscii;
   }
 
-  private String checkCompatiblyEscaped30(String str, String test)
-  {
-    String result = "";
-
-    char[] chars = str.toCharArray();
-    for (char c : chars)
-    {
-      if (Character.isISOControl(c))
-      {
-        result += "\"" + Character.toString(c) + "\",";
-        test += Character.toString(c);
-      }
-
-      // DEL (U+007F)
-      if (c == '\u007F')
-      {
-        result += "\"" + Character.toString(c) + "\",";
-        test += Character.toString(c);
-      }
-      String unicodeType = Character.UnicodeBlock.of(c).toString();
-      if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType))
-      {
-        result += "\"" + Character.toString(c) + "\",";
-      }
-    }
-    if (result.length() > 1)
-    {
-      result = result.substring(0, result.length() - 1);
-      report.message(MessageId.PKG_009, location, str, result);
-    }
-    return test;
-  }
 }
diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java b/src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
@@ -207,16 +207,12 @@ protected boolean checkContent()
 
     for (OPFItem item : opfHandler.getItems())
     {
-      // only check Filename CompatiblyEscaped when in "-mode opf"
-      // this is when 'xrefChecker' Object is null which is an indicator for
-      // single file validation
-      // (Had no better possibility in mind since "mode" isn't available in
-      // OPFChecker.java)
-      //
-      // bugfix for issue 239
-      if (!context.xrefChecker.isPresent())
+      // only check the filename in single-file mode
+      // (it is checked by the container checker in full-publication mode)
+      // and for local resources (i.e. computed to a file URL)
+      if (!context.container.isPresent() && !item.isRemote())
       {
-        new OCFFilenameChecker(context).checkCompatiblyEscaped(item.getPath());
+        new OCFFilenameChecker(item.getPath(), context, item.getLocation()).check();
       }
       if (!item.equals(opfHandler.getItemByURL(item.getURL()).orNull()))
       {

diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties
@@ -297,10 +297,10 @@ PKG_005=The mimetype file has an extra field of length %1$s. The use of the extr
 PKG_006=Mimetype file entry is missing or is not the first file in the archive.
 PKG_007=Mimetype file should only contain the string "application/epub+zip" and should not be compressed.
 PKG_008=Unable to read file "%1$s".
-PKG_009=File name contains characters that are not allowed in OCF file names: "%1$s".
-PKG_010=Filename contains spaces, therefore URI escaping is necessary. Consider removing spaces from filename.
-PKG_011=Filename is not allowed to end with ".".
-PKG_012=File name contains the following non-ascii characters: %1$s. Consider changing the filename.
+PKG_009=The file name "%1$s" contains characters that are not allowed in OCF file names: %2$s.
+PKG_010=The file name "%1$s" contains spaces, which may create interoperability issues with older reading systems.
+PKG_011=The file name "%1$s" is not allowed to end with ".".
+PKG_012=The file name "%1$s" contains non-ASCII characters, which might create interoperability issues with older reading systems.
 PKG_013=The EPUB file includes multiple OPS renditions.
 PKG_014=The EPUB contains empty directory "%1$s".
 PKG_015=Unable to read EPUB contents: %1$s