Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve checking of OCF file name characters #1408

Merged
merged 2 commits into from
Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
<version>72.1</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
Expand Down
44 changes: 24 additions & 20 deletions src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.w3c.epubcheck.constants.MIMEType;
import org.w3c.epubcheck.core.AbstractChecker;
Expand Down Expand Up @@ -281,14 +282,13 @@ private boolean checkContainerStructure(OCFCheckerState state)
// FIXME 2022 build resourcesProvider depending on MIME type
// Get a container
Iterable<OCFResource> resourcesProvider = new OCFZipResources(context.url);
// Map to store the container resource files
Map<String, OCFResource> resources = new HashMap<>();
// List to store the container resource directories
List<String> directories = new LinkedList<>();
// Set to store the normalized paths for duplicate checks
final Set<String> normalizedPaths = new HashSet<>();
// Lists to store the container entries for later empty directory check
final List<String> filePaths = new LinkedList<>();
final List<String> directoryPaths = new LinkedList<>();

// Loop through the entries
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
// FIXME catch IAE MALFORMED entries
for (OCFResource resource : resourcesProvider)
{
Preconditions.checkNotNull(resource.getPath());
Expand All @@ -297,12 +297,12 @@ private boolean checkContainerStructure(OCFCheckerState state)
// FIXME 2022 report symbolic links and continue

// Check duplicate entries
if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT)))
{
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
}
// Check duplicate entries after NFC normalization
else if (resources.containsKey(
else if (normalizedPaths.contains(
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
{
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
Expand All @@ -312,38 +312,42 @@ else if (resources.containsKey(
if (resource.isDirectory())
{
// the container resource is a directory,
// store it for later checking of empty directories
directories.add(resource.getPath());
// store its path for later checking of empty directories
directoryPaths.add(resource.getPath());
}
else
{
// The container resource is a file,
// store its path for later checking of empty directories
filePaths.add(resource.getPath());
normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT));

// Check file name requirements
filenameChecker.checkCompatiblyEscaped(resource.getPath());
new OCFFilenameChecker(resource.getPath(), state.context().build()).check();

// report entry metadata
// Report entry metadata
reportFeatures(resource.getProperties());
// the container resource is a file,
// add the resource to the container model
resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);

// Add the resource to the container model
state.addResource(resource);
}
}

// Report empty directories
for (String directory : directories)
for (String directoryPath : directoryPaths)
{
boolean hasContents = false;
for (OCFResource resource : resources.values())
for (String filePath : filePaths)
{
if (resource.getPath().startsWith(directory))
if (filePath.startsWith(directoryPath))
{
hasContents = true;
break;
}
}
if (!hasContents)
{
report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
report.message(MessageId.PKG_014, EPUBLocation.of(context), directoryPath);
}
}
return true;
Expand Down
1 change: 0 additions & 1 deletion src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
Expand Down
211 changes: 123 additions & 88 deletions src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java
Original file line number Diff line number Diff line change
@@ -1,127 +1,162 @@
package com.adobe.epubcheck.ocf;

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Collectors;

import org.w3c.epubcheck.core.Checker;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.messages.MessageId;
import com.adobe.epubcheck.opf.ValidationContext;
import com.adobe.epubcheck.util.EPUBVersion;
import com.google.common.collect.ImmutableSet;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UForwardCharacterIterator;
import com.ibm.icu.text.UnicodeSet;

//FIXME 2022 update related PKG-* messages to contain the file name string
public final class OCFFilenameChecker
public final class OCFFilenameChecker implements Checker
{
private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA",
"ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
"SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS");

private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze();

/**
 * Code points that are reported as disallowed in a file name when the
 * publication is checked as EPUB 2 (see the {@code VERSION_2} branch of
 * {@code check()}).
 * <p>
 * Two characters are deliberately left out, as the commented-out lines show:
 * SOLIDUS (U+002F) because it is the path separator, and FULL STOP (U+002E)
 * because it is only rejected as the last character, which is tested
 * separately. The set is frozen after construction.
 */
private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet()
// .add(0x002F)// SOLIDUS '/' -- allowed as path separator
.add(0x0022)// QUOTATION MARK '"'
.add(0x002A)// ASTERISK '*'
// .add(0x002E)// FULL STOP '.' -- only disallowed as the last character
.add(0x003A)// COLON ':'
.add(0x003C)// LESS-THAN SIGN '<'
.add(0x003E)// GREATER-THAN SIGN '>'
.add(0x003F)// QUESTION MARK '?'
.add(0x005C)// REVERSE SOLIDUS '\'
.freeze();

/**
 * Code points that are reported as disallowed in a file name when the
 * publication is not checked as EPUB 2, grouped by category.
 * <p>
 * Each key is a human-readable category label; it is handed to
 * {@code toString(int, String)} to build the text shown in the report. Each
 * value is the frozen set of code points belonging to that category. The
 * {@code "ASCII"} category is everything in {@link #DISALLOWED_EPUB2} plus
 * VERTICAL LINE.
 * <p>
 * Entry order matters: {@code check()} walks the key set and stops at the
 * first set containing the code point, so a code point covered by several
 * ranges is reported under the category declared first.
 * NOTE(review): this relies on the map iterating in insertion order --
 * confirm against the Guava ImmutableMap documentation.
 */
private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>()
.put("ASCII", new UnicodeSet() //
.addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1
.add(0x007C) // VERTICAL LINE '|'
.freeze())
.put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")//
.freeze())
.put("CONTROL", new UnicodeSet().add(0x007F) // DEL
.addAll(0x0000, 0x001F) // C0 range
.addAll(0x0080, 0x009F) // C1 range
.freeze())
.put("PRIVATE USE", new UnicodeSet() //
.addAll(0xE000, 0xF8FF) // Private Use Area
.addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A
.addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B
.freeze())
.put("SPECIALS", new UnicodeSet() //
.addAll(0xFFF0, 0xFFFF) // Specials Blocks
.freeze())
.put("DEPRECATED", new UnicodeSet() //
.add(0xE0001)// LANGUAGE TAG
// .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences
.freeze())
.build();

/**
 * Builds the human-readable description of a disallowed code point that is
 * listed in the report message.
 * <p>
 * The result always starts with the code point in {@code U+XXXX} notation
 * (at least four upper-case hex digits) followed by a space. For the
 * {@code "ASCII"} category the character itself follows in parentheses; for
 * any other category the character name (when ICU knows one) and the
 * category label in parentheses follow.
 *
 * @param codepoint the Unicode code point to describe
 * @param setName the category label the code point was found in; must not be
 *        {@code null} (only checked by an {@code assert})
 * @return the formatted description
 */
private static String toString(int codepoint, String setName)
{
  assert setName != null;
  final String hexPrefix = String.format("U+%04X ", codepoint);

  // ASCII characters are printable, so show the character itself
  if ("ASCII".equals(setName))
  {
    return hexPrefix + '(' + UCharacter.toString(codepoint) + ')';
  }

  // Other categories: show the character name when there is one, then the
  // category label
  final String name = UCharacter.getName(codepoint);
  final String namePart = (name == null) ? "" : name + ' ';
  return hexPrefix + namePart + '(' + setName + ')';
}

private final Report report;
private final EPUBVersion version;
private final EPUBLocation location;
private final String filename;

/**
 * Creates a checker for the given file name, without an explicit report
 * location.
 * <p>
 * Delegates to the three-argument constructor with a {@code null} location.
 *
 * @param filename the file name to check
 * @param context the validation context passed on to the three-argument
 *        constructor
 */
public OCFFilenameChecker(String filename, ValidationContext context)
{
this(filename, context, null);
}

public OCFFilenameChecker(ValidationContext context)
public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location)
{
Preconditions.checkArgument(filename != null);
Preconditions.checkArgument(context != null);
this.filename = filename;
this.report = context.report;
this.version = context.version;
this.location = EPUBLocation.of(context);
this.location = (location != null) ? location : EPUBLocation.of(context);
}

public String checkCompatiblyEscaped(final String str)
@Override
public void check()
{
// don't check remote resources
if (str.matches("^[^:/?#]+://.*"))
{
return "";
}

// the test string will be used to compare test result
String test = checkNonAsciiFilename(str);

if (str.endsWith("."))
{
report.message(MessageId.PKG_011, location, str);
test += ".";
}

boolean spaces = false;
final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*',
'?' /* , ':','/', '\\' */ };
String result = "";
char[] chars = str.toCharArray();
for (char c : chars)
// Iterate through the code points to search disallowed characters
UCharacterIterator chars = UCharacterIterator.getInstance(filename);
final Set<String> disallowed = new LinkedHashSet<>();
boolean hasSpaces = false;
boolean isASCIIOnly = true;
int codepoint;
while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE)
{
for (char a : ascciGraphic)
// Check if the string has non-ASCII characters
isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint);
// Check if the string has space characters
hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint);
// Check for disallowed characters
switch (version)
{
if (c == a)
case VERSION_2:
if (DISALLOWED_EPUB2.contains(codepoint))
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
disallowed.add(toString(codepoint, "ASCII"));
}
}
if (Character.isSpaceChar(c))
{
spaces = true;
test += Character.toString(c);
break;
default:
for (String name : DISALLOWED_EPUB3.keySet())
{
if (DISALLOWED_EPUB3.get(name).contains(codepoint))
{
disallowed.add(toString(codepoint, name));
break;
}
}
break;
}
}
if (result.length() > 1)
// Check that FULL STOP is not used as the last character
if (chars.previousCodePoint() == 0x002E)
{
result = result.substring(0, result.length() - 1);
report.message(MessageId.PKG_009, location, str, result);
report.message(MessageId.PKG_011, location, filename);
}
if (spaces)
// Report if disallowed characters were found
if (!disallowed.isEmpty())
{
report.message(MessageId.PKG_010, location, str);
report.message(MessageId.PKG_009, location, filename,
disallowed.stream().collect(Collectors.joining(", ")));
}

if (version == EPUBVersion.VERSION_3)
// Report whitespace characters
if (hasSpaces)
{
checkCompatiblyEscaped30(str, test);
report.message(MessageId.PKG_010, location, filename);
}
return test;
}

private String checkNonAsciiFilename(final String str)
{
String nonAscii = str.replaceAll("[\\p{ASCII}]", "");
if (nonAscii.length() > 0)
// Report non-ASCII characters as usage
if (!isASCIIOnly)
{
report.message(MessageId.PKG_012, location, str, nonAscii);
report.message(MessageId.PKG_012, location, filename);
}
return nonAscii;
}

private String checkCompatiblyEscaped30(String str, String test)
{
String result = "";

char[] chars = str.toCharArray();
for (char c : chars)
{
if (Character.isISOControl(c))
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
}

// DEL (U+007F)
if (c == '\u007F')
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
}
String unicodeType = Character.UnicodeBlock.of(c).toString();
if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType))
{
result += "\"" + Character.toString(c) + "\",";
}
}
if (result.length() > 1)
{
result = result.substring(0, result.length() - 1);
report.message(MessageId.PKG_009, location, str, result);
}
return test;
}
}
1 change: 0 additions & 1 deletion src/main/java/com/adobe/epubcheck/ocf/OCFMetaFile.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package com.adobe.epubcheck.ocf;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.opf.ValidationContext;
import com.google.common.base.Preconditions;

import io.mola.galimatias.GalimatiasParseException;
Expand Down
Loading