Merge pull request #546 from diffplug/eclipse_wtp_encoding

Fix Eclipse WTP encoding handling
diffplug · Mar 26, 2020 · b78fc1b · b78fc1b
2 parents 8aab108 + 08d4e4e
commit b78fc1b
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 7 deletions.
diff --git a/_ext/eclipse-wtp/CHANGES.md b/_ext/eclipse-wtp/CHANGES.md
@@ -3,6 +3,11 @@
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `3.15.1`).
 
 ## [Unreleased]
+### Fixed
+* Handling of character encodings on operating systems with a non-UTF-8 default file encoding.
+CSS, HTML and JSON formatter steps intermediately encoded the input
+to UTF-8 but used the default file encoding for decoding. This process failed
+if the input contained characters whose byte representation differs between UTF-8 and the OS default file encoding ([#545](https://github.com/diffplug/spotless/issues/545)).
 
 ## [3.15.2] - 2020-03-04
 ### Fixed

diff --git a/...pse-wtp/src/main/java/com/diffplug/spotless/extra/eclipse/wtp/sse/ContentTypeManager.java b/...pse-wtp/src/main/java/com/diffplug/spotless/extra/eclipse/wtp/sse/ContentTypeManager.java
@@ -36,19 +36,34 @@
 import com.diffplug.spotless.extra.eclipse.base.service.NoContentTypeSpecificHandling;
 
 /**
- * For some embedded formatters, the WTP uses the content type ID for
- * preferences lookup.
+
+ * WTP ModelHandlerRegistry uses the content type manager for clean-up formatters
+ * to associate content with content-type-related functionality.
  * <p>
- * The preference lookup is accomplished via the Eclipse preference service,
- * which must be provided in combination with this service.
- * For cleanup tasks, the ID mapping is also used by the model handler
- * to determine the model which a string stream requires.
+ * Preference lookup per content type is accomplished via the
+ * Eclipse PreferencesService, which must be provided in combination with
+ * this service.
  * </p>
+ * The input byte stream encoding detection is accomplished by the
+ * content type manager. Normally the encoding is bound to a document/file.
+ * Spotless applies the formatting to strings that are already decoded.
+ * For non-documents, the WTP AbstractStructuredCleanupProcessor provides
+ * a clean-up function converting the decoded string into a UTF-8 encoded byte stream.
+ * WTP AbstractDocumentLoader uses the content type manager to determine the encoding
+ * of the input stream.
+ * Only the steps that use the AbstractStructuredCleanupProcessor
+ * are affected. All other steps create an empty document
+ * (e.g. via WTP AbstractDocumentLoader) and set the textual content of the new document.
+ *
+ * @see org.eclipse.core.internal.preferences.PreferencesService
+ * @see org.eclipse.wst.sse.core.internal.cleanup.AbstractStructuredCleanupProcessor
+ * @see org.eclipse.wst.sse.core.internal.document.AbstractDocumentLoader
  * @see org.eclipse.wst.sse.core.internal.modelhandler.ModelHandlerRegistry
  */
 class ContentTypeManager extends NoContentTypeSpecificHandling {
 	private final Map<String, IContentType> id2Object;
 	private final IContentType processorStepType;
+	private final IContentDescription processorStepDescription;
 
 	/**
 	 * Content type manager as required for cleanup steps.
@@ -66,6 +81,7 @@ class ContentTypeManager extends NoContentTypeSpecificHandling {
 		if (null == processorStepType) {
 			throw new IllegalArgumentException("The manager does not support content type " + formatterContentTypeID);
 		}
+		processorStepDescription = new StringDescription(processorStepType);
 	}
 
 	@Override
@@ -83,8 +99,48 @@ public IContentType findContentTypeFor(InputStream contents, String fileName) th
 		return processorStepType;
 	}
 
+	@Override
+	public IContentDescription getDescriptionFor(InputStream contents, String fileName, QualifiedName[] options) throws IOException {
+		return processorStepDescription;
+	}
+
+	private static class StringDescription implements IContentDescription {
+
+		private final IContentType type;
+
+		public StringDescription(IContentType type) {
+			this.type = type;
+		}
+
+		@Override
+		public boolean isRequested(QualifiedName key) {
+			return false; //Don't use set Property
+		}
+
+		@Override
+		public String getCharset() {
+			//Called by AbstractDocumentLoader.readInputStream
+			return "UTF-8"; //UTF-8 encoded by AbstractStructuredCleanupProcessor.cleanupContent
+		}
+
+		@Override
+		public IContentType getContentType() {
+			return type;
+		}
+
+		@Override
+		public Object getProperty(QualifiedName key) {
+			return null; //Assume that the property map is empty
+		}
+
+		@Override
+		public void setProperty(QualifiedName key, Object value) {
+			throw new IllegalArgumentException("Content description key cannot be set: " + key);
+		}
+	}
+
 	/**
-	 * The WTP uses the manager only for ID mapping, so most of the methods are not used.
+	 * The WTP uses the manager mainly for ID mapping, so most of the methods are not used.
 	 * Actually it has a hand stitched way for transforming the content type ID
 	 * {@code org.eclipse.wst...source} to the plugin ID {@code org.eclipse.wst...core}.
 	 * @see org.eclipse.wst.sse.core.internal.encoding.ContentBasedPreferenceGateway

diff --git a/...c/test/java/com/diffplug/spotless/extra/eclipse/wtp/EclipseHtmlFormatterStepImplTest.java b/...c/test/java/com/diffplug/spotless/extra/eclipse/wtp/EclipseHtmlFormatterStepImplTest.java
@@ -89,6 +89,26 @@ public void formatCSS() throws Exception {
 				testData.expected("css.html"), output);
 	}
 
+	@Test
+	public void checkCleanupForNonUtf8() throws Exception {
+		String osEncoding = System.getProperty("file.encoding");
+		System.setProperty("file.encoding", "ISO-8859-1"); //Simulate a non UTF-8 OS
+		String[] input = testData.input("utf-8.html");
+		String output = formatter.format(input[0]);
+		System.setProperty("file.encoding", osEncoding);
+		assertEquals("Unexpected formatting of UTF-8", testData.expected("utf-8.html"), output);
+	}
+
+	@Test
+	public void checkBOMisStripped() throws Exception {
+		String[] input = testData.input("bom.html");
+		String[] inputWithoutBom = testData.input("utf-8.html");
+		//The UTF-8 BOM is interpreted as one UTF-16 character.
+		assertEquals("BOM input invalid", input[0].length() - 1, inputWithoutBom[0].length());
+		String output = formatter.format(input[0]);
+		assertEquals("BOM is not stripped", testData.expected("utf-8.html"), output);
+	}
+
 	@Test(expected = IllegalArgumentException.class)
 	public void configurationChange() throws Exception {
 		new EclipseHtmlFormatterStepImpl(new Properties());

diff --git a/_ext/eclipse-wtp/src/test/resources/html/expected/utf-8.html b/_ext/eclipse-wtp/src/test/resources/html/expected/utf-8.html
@@ -0,0 +1,7 @@
+<!DOCTYPE html>
+<HTML>
+<HEAD>
+<META charset="UTF-8">
+<TITLE>ÄÜ€</TITLE>
+</HEAD>
+</HTML>
diff --git a/_ext/eclipse-wtp/src/test/resources/html/input/bom.html b/_ext/eclipse-wtp/src/test/resources/html/input/bom.html
@@ -0,0 +1,2 @@
+<!DOCTYPE html>
+<html><head><meta charset="UTF-8"><title>ÄÜ€</title></head></html>
diff --git a/_ext/eclipse-wtp/src/test/resources/html/input/utf-8.html b/_ext/eclipse-wtp/src/test/resources/html/input/utf-8.html
@@ -0,0 +1,2 @@
+<!DOCTYPE html>
+<html><head><meta charset="UTF-8"><title>ÄÜ€</title></head></html>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		<!DOCTYPE html>
		<html><head><meta charset="UTF-8"><title>ÄÜ€</title></head></html>