From 5cfabbee4425686fe2cbf9bcb640d1ad7da2995c Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 20:51:51 +0900 Subject: [PATCH] Drop superseded encoding-checking code This change drops some code which performs various encoding checks that no longer correspond to any current requirements in the Encoding spec. --- src/nu/validator/htmlparser/io/Driver.java | 27 ------- src/nu/validator/htmlparser/io/Encoding.java | 81 ------------------- .../validator/htmlparser/io/MetaSniffer.java | 22 ----- 3 files changed, 130 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index 433517a5..4e2ebe75 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -453,33 +453,6 @@ protected Encoding encodingFromExternalDeclaration(String encoding) protected Encoding whineAboutEncodingAndReturnActual(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!canonName.equals(encoding)) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - tokenizer.warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isLikelyEbcdic()) { - tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - tokenizer.warn("The character encoding \u201C" - + encoding - + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } if (!canonName.equals(encoding)) { tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName)); } diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index c167e96b..b5085113 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -52,39 +52,13 @@ public class Encoding { public static final Encoding WINDOWS1252; - private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" }; - - private static String[] BANNED = { "bocu1", "cesu8", "compoundtext", - "iscii91", "macarabic", "maccentraleurroman", "maccroatian", - "maccyrillic", "macdevanagari", "macfarsi", "macgreek", - "macgujarati", "macgurmukhi", "machebrew", "macicelandic", - "macroman", "macromanian", "macthai", "macturkish", "macukranian", - "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname", - "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom", - "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian", - "xutf32oppositeendian", "xutf32platformendian" }; private static Map encodingByLabel = new HashMap(); - private static String[] NOT_OBSCURE = { "big5", "big5hkscs", "eucjp", - "euckr", "gb18030", "gbk", "iso2022jp", "iso2022kr", "iso88591", - "iso885913", "iso885915", "iso88592", "iso88593", "iso88594", - "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", - "koi8r", "shiftjis", "tis620", "usascii", "utf16", "utf16be", - "utf16le", "utf8", "windows1250", "windows1251", "windows1252", - "windows1253", "windows1254", "windows1255", "windows1256", - "windows1257", "windows1258" }; - private final String canonName; private final Charset charset; - private final boolean obscure; - - private final boolean shouldNot; - - private final boolean likelyEbcdic; - static { Set encodings = new HashSet(); @@ -119,30 +93,6 @@ asciiSuperset, isObscure(name), isShouldNot(name), WINDOWS1252 = forName("windows-1252"); } - private static boolean isObscure(String lowerCasePreferredIanaName) { - return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1); - } - - private static boolean isBanned(String lowerCasePreferredIanaName) { - if (lowerCasePreferredIanaName.startsWith("xibm")) { - return true; - } - return (Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) > -1); - } - - private static boolean isShouldNot(String lowerCasePreferredIanaName) { - return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1); - } - - private static boolean isLikelyEbcdic(String canonName, - boolean asciiSuperset) { - if (!asciiSuperset) { - return (canonName.startsWith("cp") || canonName.startsWith("ibm") || canonName.startsWith("xibm")); - } else { - return false; - } - } - public static Encoding forName(String name) { Encoding rv = encodingByLabel.get(toNameKey(name)); if (rv == null) { @@ -216,37 +166,6 @@ public String getCanonName() { return canonName; } - /** - * Returns the likelyEbcdic. - * - * @return the likelyEbcdic - */ - public boolean isLikelyEbcdic() { - return likelyEbcdic; - } - - /** - * Returns the obscure. - * - * @return the obscure - */ - public boolean isObscure() { - return obscure; - } - - /** - * Returns the shouldNot. - * - * @return the shouldNot - */ - public boolean isShouldNot() { - return shouldNot; - } - - public boolean isRegistered() { - return !canonName.startsWith("x-"); - } - /** * @return * @see java.nio.charset.Charset#canEncode() diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index e8e7018e..600c1e72 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -169,28 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException { } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!cs.getCanonName().equals(encoding)) { - err("The encoding \u201C" + encoding - + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } if (!cs.getCanonName().equals(encoding)) { err(Encoding.msgNotCanonicalName(encoding, canonName)); this.characterEncoding = cs;