w3c · rdeltour · Dec 6, 2022 · Oct 28, 2022 · Oct 28, 2022
diff --git a/pom.xml b/pom.xml
@@ -173,7 +173,7 @@
         <dependency>
             <groupId>com.ibm.icu</groupId>
             <artifactId>icu4j</artifactId>
-            <version>70.1</version>
+            <version>72.1</version>
         </dependency>
         <dependency>
             <groupId>net.sf.saxon</groupId>

diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
@@ -26,11 +26,12 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.Normalizer;
-import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 
 import org.w3c.epubcheck.constants.MIMEType;
 import org.w3c.epubcheck.core.AbstractChecker;
@@ -281,14 +282,13 @@ private boolean checkContainerStructure(OCFCheckerState state)
       // FIXME 2022 build resourcesProvider depending on MIME type
       // Get a container
       Iterable<OCFResource> resourcesProvider = new OCFZipResources(context.url);
-      // Map to store the container resource files
-      Map<String, OCFResource> resources = new HashMap<>();
-      // List to store the container resource directories
-      List<String> directories = new LinkedList<>();
+      // Set to store the normalized paths for duplicate checks
+      final Set<String> normalizedPaths = new HashSet<>();
+      // Lists to store the container entries for later empty directory check
+      final List<String> filePaths = new LinkedList<>();
+      final List<String> directoryPaths = new LinkedList<>();
 
       // Loop through the entries
-      OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
-      // FIXME catch IAE MALFORMED entries
       for (OCFResource resource : resourcesProvider)
       {
         Preconditions.checkNotNull(resource.getPath());
@@ -297,12 +297,12 @@ private boolean checkContainerStructure(OCFCheckerState state)
         // FIXME 2022 report symbolic links and continue
 
         // Check duplicate entries
-        if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
+        if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT)))
         {
           context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
         }
         // Check duplicate entries after NFC normalization
-        else if (resources.containsKey(
+        else if (normalizedPaths.contains(
             Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
         {
           context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
@@ -312,38 +312,42 @@ else if (resources.containsKey(
         if (resource.isDirectory())
         {
           // the container resource is a directory,
-          // store it for later checking of empty directories
-          directories.add(resource.getPath());
+          // store its path for later checking of empty directories
+          directoryPaths.add(resource.getPath());
         }
         else
         {
+          // The container resource is a file,
+          // sStore its path for later checking of empty directories
+          filePaths.add(resource.getPath());
+          normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT));
+
           // Check file name requirements
-          filenameChecker.checkCompatiblyEscaped(resource.getPath());
+          new OCFFilenameChecker(resource.getPath(), state.context().build()).check();
 
-          // report entry metadata
+          // Report entry metadata
           reportFeatures(resource.getProperties());
-          // the container resource is a file,
-          // add the resource to the container model
-          resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);
+
+          // Add the resource to the container model
           state.addResource(resource);
         }
       }
 
       // Report empty directories
-      for (String directory : directories)
+      for (String directoryPath : directoryPaths)
       {
         boolean hasContents = false;
-        for (OCFResource resource : resources.values())
+        for (String filePath : filePaths)
         {
-          if (resource.getPath().startsWith(directory))
+          if (filePath.startsWith(directoryPath))
           {
             hasContents = true;
             break;
           }
         }
         if (!hasContents)
         {
-          report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
+          report.message(MessageId.PKG_014, EPUBLocation.of(context), directoryPath);
         }
       }
       return true;

diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java b/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java
@@ -3,7 +3,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.LinkedHashMap;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;

diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java
@@ -1,127 +1,162 @@
 package com.adobe.epubcheck.ocf;
 
+import java.util.LinkedHashSet;
 import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.w3c.epubcheck.core.Checker;
 
 import com.adobe.epubcheck.api.EPUBLocation;
 import com.adobe.epubcheck.api.Report;
 import com.adobe.epubcheck.messages.MessageId;
 import com.adobe.epubcheck.opf.ValidationContext;
 import com.adobe.epubcheck.util.EPUBVersion;
-import com.google.common.collect.ImmutableSet;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UCharacterIterator;
+import com.ibm.icu.text.UForwardCharacterIterator;
+import com.ibm.icu.text.UnicodeSet;
 
-//FIXME 2022 update related PKG-* messages to contain the file name string
-public final class OCFFilenameChecker
+public final class OCFFilenameChecker implements Checker
 {
-  private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA",
-      "ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
-      "SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS");
+
+  private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze();
+
+  private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet()
+      // .add(0x002F)// SOLIDUS '/' -- allowed as path separator
+      .add(0x0022)// QUOTATION MARK '"'
+      .add(0x002A)// ASTERISK '*'
+      // .add(0x002E)// FULL STOP '.' -- only disallowed as the last character
+      .add(0x003A)// COLON ':'
+      .add(0x003C)// LESS-THAN SIGN '<'
+      .add(0x003E)// GREATER-THAN SIGN '>'
+      .add(0x003F)// QUESTION MARK '?'
+      .add(0x005C)// REVERSE SOLIDUS '\'
+      .freeze();
+
+  private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>()
+      .put("ASCII", new UnicodeSet() //
+          .addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1
+          .add(0x007C) // VERTICAL LINE '|'
+          .freeze())
+      .put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")//
+          .freeze())
+      .put("CONTROL", new UnicodeSet().add(0x007F) // DEL
+          .addAll(0x0000, 0x001F) // C0 range
+          .addAll(0x0080, 0x009F) // C1 range
+          .freeze())
+      .put("PRIVATE USE", new UnicodeSet() //
+          .addAll(0xE000, 0xF8FF) // Private Use Area
+          .addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A
+          .addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B
+          .freeze())
+      .put("SPECIALS", new UnicodeSet() //
+          .addAll(0xFFF0, 0xFFFF) // Specials Blocks
+          .freeze())
+      .put("DEPRECATED", new UnicodeSet() //
+          .add(0xE0001)// LANGUAGE TAG
+          // .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences
+          .freeze())
+      .build();
+
+  private static String toString(int codepoint, String setName)
+  {
+    assert setName != null;
+    StringBuilder result = new StringBuilder().append(String.format("U+%04X ", codepoint));
+    if ("ASCII".equals(setName))
+    {
+      result.append('(').append(UCharacter.toString(codepoint)).append(')');
+    }
+    else
+    {
+      String characterName = UCharacter.getName(codepoint);
+      if (characterName != null)
+      {
+        result.append(characterName).append(' ');
+      }
+      result.append('(').append(setName).append(')');
+    }
+    return result.toString();
+  }
 
   private final Report report;
   private final EPUBVersion version;
   private final EPUBLocation location;
+  private final String filename;
+
+  public OCFFilenameChecker(String filename, ValidationContext context)
+  {
+    this(filename, context, null);
+  }
 
-  public OCFFilenameChecker(ValidationContext context)
+  public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location)
   {
+    Preconditions.checkArgument(filename != null);
+    Preconditions.checkArgument(context != null);
+    this.filename = filename;
     this.report = context.report;
     this.version = context.version;
-    this.location = EPUBLocation.of(context);
+    this.location = (location != null) ? location : EPUBLocation.of(context);
   }
 
-  public String checkCompatiblyEscaped(final String str)
+  @Override
+  public void check()
   {
-    // don't check remote resources
-    if (str.matches("^[^:/?#]+://.*"))
-    {
-      return "";
-    }
-
-    // the test string will be used to compare test result
-    String test = checkNonAsciiFilename(str);
-
-    if (str.endsWith("."))
-    {
-      report.message(MessageId.PKG_011, location, str);
-      test += ".";
-    }
-
-    boolean spaces = false;
-    final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*',
-        '?' /* , ':','/', '\\' */ };
-    String result = "";
-    char[] chars = str.toCharArray();
-    for (char c : chars)
+    // Iterate through the code points to search disallowed characters
+    UCharacterIterator chars = UCharacterIterator.getInstance(filename);
+    final Set<String> disallowed = new LinkedHashSet<>();
+    boolean hasSpaces = false;
+    boolean isASCIIOnly = true;
+    int codepoint;
+    while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE)
     {
-      for (char a : ascciGraphic)
+      // Check if the string has non-ASCII characters
+      isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint);
+      // Check if the string has space characters
+      hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint);
+      // Check for disallowed characters
+      switch (version)
       {
-        if (c == a)
+      case VERSION_2:
+        if (DISALLOWED_EPUB2.contains(codepoint))
         {
-          result += "\"" + Character.toString(c) + "\",";
-          test += Character.toString(c);
+          disallowed.add(toString(codepoint, "ASCII"));
         }
-      }
-      if (Character.isSpaceChar(c))
-      {
-        spaces = true;
-        test += Character.toString(c);
+        break;
+      default:
+        for (String name : DISALLOWED_EPUB3.keySet())
+        {
+          if (DISALLOWED_EPUB3.get(name).contains(codepoint))
+          {
+            disallowed.add(toString(codepoint, name));
+            break;
+          }
+        }
+        break;
       }
     }
-    if (result.length() > 1)
+    // Check that FULL STOP is not used as the last character
+    if (chars.previousCodePoint() == 0x002E)
     {
-      result = result.substring(0, result.length() - 1);
-      report.message(MessageId.PKG_009, location, str, result);
+      report.message(MessageId.PKG_011, location, filename);
     }
-    if (spaces)
+    // Report if disallowed characters were found
+    if (!disallowed.isEmpty())
     {
-      report.message(MessageId.PKG_010, location, str);
+      report.message(MessageId.PKG_009, location, filename,
+          disallowed.stream().collect(Collectors.joining(", ")));
     }
-
-    if (version == EPUBVersion.VERSION_3)
+    // Report whitespace characters
+    if (hasSpaces)
     {
-      checkCompatiblyEscaped30(str, test);
+      report.message(MessageId.PKG_010, location, filename);
     }
-    return test;
-  }
-
-  private String checkNonAsciiFilename(final String str)
-  {
-    String nonAscii = str.replaceAll("[\\p{ASCII}]", "");
-    if (nonAscii.length() > 0)
+    // Report non-ASCII characters as usage
+    if (!isASCIIOnly)
     {
-      report.message(MessageId.PKG_012, location, str, nonAscii);
+      report.message(MessageId.PKG_012, location, filename);
     }
-    return nonAscii;
   }
 
-  private String checkCompatiblyEscaped30(String str, String test)
-  {
-    String result = "";
-
-    char[] chars = str.toCharArray();
-    for (char c : chars)
-    {
-      if (Character.isISOControl(c))
-      {
-        result += "\"" + Character.toString(c) + "\",";
-        test += Character.toString(c);
-      }
-
-      // DEL (U+007F)
-      if (c == '\u007F')
-      {
-        result += "\"" + Character.toString(c) + "\",";
-        test += Character.toString(c);
-      }
-      String unicodeType = Character.UnicodeBlock.of(c).toString();
-      if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType))
-      {
-        result += "\"" + Character.toString(c) + "\",";
-      }
-    }
-    if (result.length() > 1)
-    {
-      result = result.substring(0, result.length() - 1);
-      report.message(MessageId.PKG_009, location, str, result);
-    }
-    return test;
-  }
 }
diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFMetaFile.java b/src/main/java/com/adobe/epubcheck/ocf/OCFMetaFile.java
@@ -1,7 +1,6 @@
 package com.adobe.epubcheck.ocf;
 
 import com.adobe.epubcheck.api.EPUBLocation;
-import com.adobe.epubcheck.opf.ValidationContext;
 import com.google.common.base.Preconditions;
 
 import io.mola.galimatias.GalimatiasParseException;