Skip to content

Commit

Permalink
Drop superseded encoding-checking code
Browse files Browse the repository at this point in the history
This change drops some code which performs various encoding checks that
no longer correspond to any current requirements in the Encoding spec.
  • Loading branch information
sideshowbarker committed Sep 13, 2020
1 parent 343d86f commit 5cfabbe
Show file tree
Hide file tree
Showing 3 changed files with 0 additions and 130 deletions.
27 changes: 0 additions & 27 deletions src/nu/validator/htmlparser/io/Driver.java
Original file line number Diff line number Diff line change
Expand Up @@ -453,33 +453,6 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
protected Encoding whineAboutEncodingAndReturnActual(String encoding,
Encoding cs) throws SAXException {
String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding. (Charmod C022)");
} else {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
}
} else if (!canonName.equals(encoding)) {
tokenizer.err("The encoding \u201C"
+ encoding
+ "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+ canonName + "\u201D. (Charmod C024)");
}
if (cs.isShouldNot()) {
tokenizer.warn("Authors should not use the character encoding \u201C"
+ encoding
+ "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isLikelyEbcdic()) {
tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
tokenizer.warn("The character encoding \u201C"
+ encoding
+ "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
}
if (!canonName.equals(encoding)) {
tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName));
}
Expand Down
81 changes: 0 additions & 81 deletions src/nu/validator/htmlparser/io/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,39 +52,13 @@ public class Encoding {

public static final Encoding WINDOWS1252;

// Name keys for which isShouldNot(String) answers true. The lookup there
// uses Arrays.binarySearch, so entries must stay in ascending String order.
private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" };

// Name keys that isBanned(String) reports as banned, in addition to any key
// starting with "xibm".
// NOTE(review): the entries from "xutf16oppositeendian" onward are NOT in
// ascending String order ("xutf32lebom" sorts after "xutf16oppositeendian"),
// so any lookup that relies on the array being sorted can miss entries.
private static String[] BANNED = { "bocu1", "cesu8", "compoundtext",
"iscii91", "macarabic", "maccentraleurroman", "maccroatian",
"maccyrillic", "macdevanagari", "macfarsi", "macgreek",
"macgujarati", "macgurmukhi", "machebrew", "macicelandic",
"macroman", "macromanian", "macthai", "macturkish", "macukranian",
"scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname",
"xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom",
"xutf32lebom", "xutf16oppositeendian", "xutf16platformendian",
"xutf32oppositeendian", "xutf32platformendian" };
// Lookup table from a name key to its Encoding; forName(String) reads it
// using the key returned by toNameKey(name).
private static Map<String, Encoding> encodingByLabel =
new HashMap<String, Encoding>();

// Name keys that isObscure(String) does NOT consider obscure; every key
// missing from this table is reported as obscure. The lookup there uses
// Arrays.binarySearch, so entries must stay in ascending String order.
private static String[] NOT_OBSCURE = { "big5", "big5hkscs", "eucjp",
"euckr", "gb18030", "gbk", "iso2022jp", "iso2022kr", "iso88591",
"iso885913", "iso885915", "iso88592", "iso88593", "iso88594",
"iso88595", "iso88596", "iso88597", "iso88598", "iso88599",
"koi8r", "shiftjis", "tis620", "usascii", "utf16", "utf16be",
"utf16le", "utf8", "windows1250", "windows1251", "windows1252",
"windows1253", "windows1254", "windows1255", "windows1256",
"windows1257", "windows1258" };

private final String canonName;

private final Charset charset;

private final boolean obscure;

private final boolean shouldNot;

private final boolean likelyEbcdic;

static {
Set<Encoding> encodings = new HashSet<Encoding>();

Expand Down Expand Up @@ -119,30 +93,6 @@ asciiSuperset, isObscure(name), isShouldNot(name),
WINDOWS1252 = forName("windows-1252");
}

/**
 * Tells whether a name key is missing from the {@code NOT_OBSCURE} table.
 *
 * @param lowerCasePreferredIanaName the name key to look up
 * @return {@code true} when the binary search over {@code NOT_OBSCURE}
 *         finds no match, {@code false} when the key is listed
 */
private static boolean isObscure(String lowerCasePreferredIanaName) {
    // A negative result from binarySearch means "not found".
    final int position = Arrays.binarySearch(NOT_OBSCURE,
            lowerCasePreferredIanaName);
    return position < 0;
}

/**
 * Tells whether a name key denotes a banned encoding.
 *
 * A key is banned when it starts with {@code "xibm"} or when it equals one
 * of the entries of the {@code BANNED} table.
 *
 * The table is scanned linearly instead of with
 * {@code Arrays.binarySearch}: binary search is only defined for sorted
 * arrays, and the trailing {@code "xutf..."} entries of {@code BANNED} are
 * not in ascending order, so a binary search could fail to find keys that
 * are in fact listed. The table is small, so the scan costs nothing
 * noticeable.
 *
 * @param lowerCasePreferredIanaName the name key to test; must not be
 *        {@code null}
 * @return {@code true} if the key is banned, {@code false} otherwise
 */
private static boolean isBanned(String lowerCasePreferredIanaName) {
    if (lowerCasePreferredIanaName.startsWith("xibm")) {
        return true;
    }
    // Order-independent lookup; see the method comment for why.
    for (String banned : BANNED) {
        if (banned.equals(lowerCasePreferredIanaName)) {
            return true;
        }
    }
    return false;
}

/**
 * Tells whether a name key is listed in the {@code SHOULD_NOT} table.
 *
 * @param lowerCasePreferredIanaName the name key to look up
 * @return {@code true} when the binary search over {@code SHOULD_NOT}
 *         finds the key, {@code false} otherwise
 */
private static boolean isShouldNot(String lowerCasePreferredIanaName) {
    // binarySearch yields a non-negative index only on a hit.
    final int position = Arrays.binarySearch(SHOULD_NOT,
            lowerCasePreferredIanaName);
    return position >= 0;
}

/**
 * Guesses from its name whether an encoding is EBCDIC-based.
 *
 * Only encodings that are not ASCII supersets are candidates; among those,
 * a name starting with {@code "cp"}, {@code "ibm"} or {@code "xibm"} counts
 * as likely EBCDIC.
 *
 * @param canonName the name to inspect; only examined when
 *        {@code asciiSuperset} is {@code false}
 * @param asciiSuperset whether the encoding is an ASCII superset
 * @return {@code true} for a non-ASCII-superset encoding whose name has one
 *         of the prefixes above, {@code false} otherwise
 */
private static boolean isLikelyEbcdic(String canonName,
        boolean asciiSuperset) {
    // Short-circuit keeps canonName untouched for ASCII supersets.
    return !asciiSuperset
            && (canonName.startsWith("cp")
                    || canonName.startsWith("ibm")
                    || canonName.startsWith("xibm"));
}

public static Encoding forName(String name) {
Encoding rv = encodingByLabel.get(toNameKey(name));
if (rv == null) {
Expand Down Expand Up @@ -216,37 +166,6 @@ public String getCanonName() {
return canonName;
}

/**
 * Reports the {@code likelyEbcdic} flag stored on this encoding.
 *
 * @return the current value of the {@code likelyEbcdic} field
 */
public boolean isLikelyEbcdic() {
    return this.likelyEbcdic;
}

/**
 * Reports the {@code obscure} flag stored on this encoding.
 *
 * @return the current value of the {@code obscure} field
 */
public boolean isObscure() {
    return this.obscure;
}

/**
 * Reports the {@code shouldNot} flag stored on this encoding.
 *
 * @return the current value of the {@code shouldNot} field
 */
public boolean isShouldNot() {
    return this.shouldNot;
}

/**
 * Tells whether this encoding counts as registered, judged purely by its
 * canonical name.
 *
 * @return {@code false} when the canonical name starts with {@code "x-"},
 *         {@code true} otherwise
 */
public boolean isRegistered() {
    final boolean hasExperimentalPrefix = canonName.startsWith("x-");
    return !hasExperimentalPrefix;
}

/**
* @return
* @see java.nio.charset.Charset#canEncode()
Expand Down
22 changes: 0 additions & 22 deletions src/nu/validator/htmlparser/io/MetaSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,28 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
} else {
Encoding cs = Encoding.forName(encoding);
String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
if (encoding.startsWith("x-")) {
err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding. (Charmod C022)");
} else {
err("The encoding \u201C"
+ encoding
+ "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
}
} else if (!cs.getCanonName().equals(encoding)) {
err("The encoding \u201C" + encoding
+ "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+ canonName + "\u201D. (Charmod C024)");
}
if (cs.isShouldNot()) {
warn("Authors should not use the character encoding \u201C"
+ encoding
+ "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
}
if (!cs.getCanonName().equals(encoding)) {
err(Encoding.msgNotCanonicalName(encoding, canonName));
this.characterEncoding = cs;
Expand Down

0 comments on commit 5cfabbe

Please sign in to comment.