ckeditor · Reinmar · Nov 27, 2018 · Oct 19, 2018 · Oct 19, 2018 · Oct 24, 2018
diff --git a/package.json b/package.json
@@ -16,9 +16,12 @@
   },
   "devDependencies": {
     "@ckeditor/ckeditor5-basic-styles": "^10.0.3",
+    "@ckeditor/ckeditor5-cloud-services": "^10.1.0",
+    "@ckeditor/ckeditor5-easy-image": "^10.0.3",
     "@ckeditor/ckeditor5-editor-classic": "^11.0.1",
     "@ckeditor/ckeditor5-enter": "^10.1.2",
     "@ckeditor/ckeditor5-heading": "^10.1.0",
+    "@ckeditor/ckeditor5-image": "^11.0.0",
     "@ckeditor/ckeditor5-link": "^10.0.4",
     "@ckeditor/ckeditor5-list": "^11.0.2",
     "@ckeditor/ckeditor5-paragraph": "^10.0.3",

diff --git a/src/filters/image.js b/src/filters/image.js
@@ -0,0 +1,206 @@
+/**
+ * @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
+ * For licensing, see LICENSE.md.
+ */
+
+/**
+ * @module paste-from-office/filters/image
+ */
+
+import ViewMatcher from '@ckeditor/ckeditor5-engine/src/view/matcher';
+import UpcastWriter from '@ckeditor/ckeditor5-engine/src/view/upcastwriter';
+
+import { convertHexToBase64 } from './utils';
+
+/**
+ * Replaces `src` of all `<img>` elements pointing to local `file://` resources with an inlined base64 representation
+ * extracted from RTF data. `<img>` elements representing Word shapes and all `<v:*>` shape elements are removed.
+ *
+ * @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment in which images are transformed.
+ * @param {String} rtfData The RTF data from which the images representation is extracted.
+ */
+export function replaceImagesSourceWithBase64( documentFragment, rtfData ) {
+	if ( !documentFragment.childCount ) {
+		return;
+	}
+
+	const writer = new UpcastWriter();
+
+	// Shape ids are gathered before any element gets removed from the fragment.
+	removeAllImgElementsRepresentingShapes( findAllShapesIds( documentFragment, writer ), documentFragment, writer );
+	removeAllShapeElements( documentFragment, writer );
+
+	const localImages = findAllImageElementsWithLocalSource( documentFragment, writer );
+
+	if ( localImages.length > 0 ) {
+		replaceImagesFileSourceWithInlineRepresentation( localImages, extractImageDataFromRtf( rtfData ), writer );
+	}
+}
+
+// Collects ids of all shape elements (`<v:*>...</v:*>`) which represent Word shapes. Shape elements may stand
+// either for images (canvas) or for Word shapes (which do not have RTF or Blob representation).
+//
+// @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment
+// from which to extract shape ids.
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
+// @returns {Array.<String>} Array of shape ids.
+function findAllShapesIds( documentFragment, writer ) {
+	const shapeElementsMatcher = new ViewMatcher( {
+		name: /v:(.+)/
+	} );
+
+	const ids = [];
+
+	for ( const { item } of writer.createRangeIn( documentFragment ) ) {
+		if ( !shapeElementsMatcher.match( item ) || !item.getAttribute( 'o:gfxdata' ) ) {
+			continue;
+		}
+		const precedingName = item.previousSibling && item.previousSibling.name || null;
+
+		// A shape with the `o:gfxdata` attribute, not directly preceded by `<v:shapetype>`, represents a Word shape.
+		if ( precedingName !== 'v:shapetype' ) {
+			ids.push( item.getAttribute( 'id' ) );
+		}
+	}
+
+	return ids;
+}
+
+// Removes every `<img>` element which stands for a Word shape instead of a regular image.
+//
+// @param {Array.<String>} shapesIds Ids of Word shapes against which the `v:shapes` attribute of `<img>` elements is checked.
+// @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment from which to remove `<img>` elements.
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
+function removeAllImgElementsRepresentingShapes( shapesIds, documentFragment, writer ) {
+	const imageElementsMatcher = new ViewMatcher( {
+		name: 'img'
+	} );
+
+	const toRemove = [];
+
+	for ( const { item } of writer.createRangeIn( documentFragment ) ) {
+		if ( !imageElementsMatcher.match( item ) ) {
+			continue;
+		}
+
+		const shapesAttribute = item.getAttribute( 'v:shapes' );
+		const imgShapes = shapesAttribute ? shapesAttribute.split( ' ' ) : [];
+		const representsShapes = imgShapes.length && imgShapes.every( shape => shapesIds.indexOf( shape ) > -1 );
+
+		// The element is removed when all shapes it refers to are Word shapes. Shapes may also have
+		// empty source when content is pasted in some browsers (Safari).
+		if ( representsShapes || !item.getAttribute( 'src' ) ) {
+			toRemove.push( item );
+		}
+	}
+
+	for ( const img of toRemove ) {
+		writer.remove( img );
+	}
+}
+
+// Strips all shape elements (`<v:*>...</v:*>`) from the fragment so they do not pollute the output structure.
+//
+// @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment from which to remove shape elements.
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
+function removeAllShapeElements( documentFragment, writer ) {
+	const shapeElementsMatcher = new ViewMatcher( {
+		name: /v:(.+)/
+	} );
+
+	const matchedShapes = [];
+
+	// First pass: only collect the matching items.
+	for ( const { item } of writer.createRangeIn( documentFragment ) ) {
+		if ( shapeElementsMatcher.match( item ) ) {
+			matchedShapes.push( item );
+		}
+	}
+
+	// Second pass: remove everything which was collected.
+	matchedShapes.forEach( shape => {
+		writer.remove( shape );
+	} );
+}
+
+// Finds all `<img>` elements in a given document fragment which have source pointing to local `file://` resource.
+// Elements without the `src` attribute are skipped.
+//
+// @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment in which to look for `<img>` elements.
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
+// @returns {Array.<module:engine/view/element~Element>} Array of found `<img>` elements with `file://` source.
+function findAllImageElementsWithLocalSource( documentFragment, writer ) {
+	const range = writer.createRangeIn( documentFragment );
+
+	const imageElementsMatcher = new ViewMatcher( {
+		name: 'img'
+	} );
+
+	const imgs = [];
+
+	for ( const value of range ) {
+		// An `<img>` may lack `src` at all, so the attribute is checked before calling `indexOf()` on it.
+		const src = imageElementsMatcher.match( value.item ) && value.item.getAttribute( 'src' );
+
+		if ( src && src.indexOf( 'file://' ) === 0 ) {
+			imgs.push( value.item );
+		}
+	}
+
+	return imgs;
+}
+
+// Extracts HEX representations of all PNG and JPEG images found in a given RTF data.
+//
+// @param {String} rtfData The RTF data from which to extract images HEX representation.
+// @returns {Array.<Object>} Array of found HEX representations. Each array item is an object containing:
+//
+// 		* {String} hex Image representation in HEX format.
+// 		* {String} type Type of image, `image/png` or `image/jpeg`.
+function extractImageDataFromRtf( rtfData ) {
+	if ( !rtfData ) {
+		return [];
+	}
+
+	const regexPictureHeader = /{\\pict[\s\S]+?\\bliptag-?\d+(\\blipupi-?\d+)?({\\\*\\blipuid\s?[\da-fA-F]+)?[\s}]*?/;
+	const regexPicture = new RegExp( '(?:(' + regexPictureHeader.source + '))([\\da-fA-F\\s]+)\\}', 'g' );
+
+	// Maps a picture group to its MIME type, or to `false` when it is neither PNG nor JPEG.
+	const detectType = picture => {
+		if ( picture.indexOf( '\\pngblip' ) !== -1 ) {
+			return 'image/png';
+		}
+
+		if ( picture.indexOf( '\\jpegblip' ) !== -1 ) {
+			return 'image/jpeg';
+		}
+
+		return false;
+	};
+
+	// `match()` gives `null` when nothing was found, so an empty array is used in such case.
+	return ( rtfData.match( regexPicture ) || [] )
+		.map( picture => ( { picture, type: detectType( picture ) } ) )
+		.filter( ( { type } ) => type )
+		.map( ( { picture, type } ) => ( {
+			// Only the HEX digits following the picture header are kept.
+			hex: picture.replace( regexPictureHeader, '' ).replace( /[^\da-fA-F]/g, '' ),
+			type
+		} ) );
+}
+
+// Replaces `src` attribute value of all given images with the corresponding base64 image representation.
+// Images and HEX sources are paired by their order, so nothing is replaced unless both arrays have the same length.
+//
+// @param {Array.<module:engine/view/element~Element>} imageElements Array of image elements which will have its source replaced.
+// @param {Array.<Object>} imagesHexSources Images HEX sources (usually the result of `extractImageDataFromRtf()` function).
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
+function replaceImagesFileSourceWithInlineRepresentation( imageElements, imagesHexSources, writer ) {
+	if ( imageElements.length !== imagesHexSources.length ) {
+		return;
+	}
+	imageElements.forEach( ( imageElement, index ) => {
+		const { type, hex } = imagesHexSources[ index ];
+		writer.setAttribute( 'src', `data:${ type };base64,${ convertHexToBase64( hex ) }`, imageElement );
+	} );
+}
diff --git a/src/filters/list.js b/src/filters/list.js
@@ -21,21 +21,19 @@ import UpcastWriter from '@ckeditor/ckeditor5-engine/src/view/upcastwriter';
  *
  * @param {module:engine/view/documentfragment~DocumentFragment} documentFragment The view structure which to transform.
  * @param {String} stylesString Styles from which list-like elements styling will be extracted.
- * @param {module:engine/view/view~View} view
  */
-export function transformListItemLikeElementsIntoLists( documentFragment, stylesString, view ) {
+export function transformListItemLikeElementsIntoLists( documentFragment, stylesString ) {
 	if ( !documentFragment.childCount ) {
 		return;
 	}
 
-	const itemLikeElements = findAllItemLikeElements( documentFragment, view );
+	const writer = new UpcastWriter();
+	const itemLikeElements = findAllItemLikeElements( documentFragment, writer );
 
 	if ( !itemLikeElements.length ) {
 		return;
 	}
 
-	const writer = new UpcastWriter();
-
 	let currentList = null;
 
 	itemLikeElements.forEach( ( itemLikeElement, i ) => {
@@ -45,7 +43,7 @@ export function transformListItemLikeElementsIntoLists( documentFragment, styles
 			currentList = insertNewEmptyList( listStyle, itemLikeElement.element, writer );
 		}
 
-		const listItem = transformElementIntoListItem( itemLikeElement.element, writer, view );
+		const listItem = transformElementIntoListItem( itemLikeElement.element, writer );
 
 		writer.appendChild( listItem, currentList );
 	} );
@@ -55,15 +53,15 @@ export function transformListItemLikeElementsIntoLists( documentFragment, styles
 //
 // @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment
 // in which to look for list-like nodes.
-// @param {module:engine/view/view~View} view
+// @param {module:engine/view/upcastwriter~UpcastWriter} writer
 // @returns {Array.<Object>} Array of found list-like items. Each item is an object containing:
 //
 //		* {module:engine/src/view/element~Element} element List-like element.
 //		* {Number} id List item id parsed from `mso-list` style (see `getListItemData()` function).
 //		* {Number} order List item creation order parsed from `mso-list` style (see `getListItemData()` function).
 //		* {Number} indent List item indentation level parsed from `mso-list` style (see `getListItemData()` function).
-function findAllItemLikeElements( documentFragment, view ) {
-	const range = view.createRangeIn( documentFragment );
+function findAllItemLikeElements( documentFragment, writer ) {
+	const range = writer.createRangeIn( documentFragment );
 
 	// Matcher for finding list-like elements.
 	const itemLikeElementsMatcher = new Matcher( {
@@ -156,8 +154,8 @@ function insertNewEmptyList( listStyle, element, writer ) {
 // @param {module:engine/view/upcastwriter~UpcastWriter} writer
 // @returns {module:engine/view/element~Element} New element to which the given one was transformed. It is
 // inserted in place of the old element (the reference to the old element is lost due to renaming).
-function transformElementIntoListItem( element, writer, view ) {
-	removeBulletElement( element, writer, view );
+function transformElementIntoListItem( element, writer ) {
+	removeBulletElement( element, writer );
 
 	return writer.rename( 'li', element );
 }
@@ -194,8 +192,7 @@ function getListItemData( element ) {
 //
 // @param {module:engine/view/element~Element} element
 // @param {module:engine/view/upcastwriter~UpcastWriter} writer
-// @param {module:engine/view/view~View} view
-function removeBulletElement( element, writer, view ) {
+function removeBulletElement( element, writer ) {
 	// Matcher for finding `span` elements holding lists numbering/bullets.
 	const bulletMatcher = new Matcher( {
 		name: 'span',
@@ -204,7 +201,7 @@ function removeBulletElement( element, writer, view ) {
 		}
 	} );
 
-	const range = view.createRangeIn( element );
+	const range = writer.createRangeIn( element );
 
 	for ( const value of range ) {
 		if ( value.type === 'elementStart' && bulletMatcher.match( value.item ) ) {

diff --git a/src/filters/parse.js b/src/filters/parse.js
@@ -29,6 +29,9 @@ import { normalizeSpacing, normalizeSpacerunSpans } from './space';
 export function parseHtml( htmlString ) {
 	const domParser = new DOMParser();
 
+	// Remove Word specific "if comments" so content inside is not omitted by the parser.
+	htmlString = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );
+
 	const normalizedHtml = normalizeSpacing( cleanContentAfterBody( htmlString ) );
 
 	// Parse htmlString as native Document object.

diff --git a/src/filters/space.js b/src/filters/space.js
@@ -11,14 +11,16 @@
  * Replaces last space preceding elements closing tag with `&nbsp;`. Such operation prevents spaces from being removed
  * during further DOM/View processing (see especially {@link module:engine/view/domconverter~DomConverter#_processDataFromDomText}).
  * This method also takes into account Word specific `<o:p></o:p>` empty tags.
+ * Additionally multiline sequences of spaces and new lines between tags are removed (see #39 and #40).
  *
  * @param {String} htmlString HTML string in which spacing should be normalized.
  * @returns {String} Input HTML with spaces normalized.
  */
 export function normalizeSpacing( htmlString ) {
 	return normalizeSafariSpaceSpans( normalizeSafariSpaceSpans( htmlString ) ) // Run normalization two times to cover nested spans.
 		.replace( / <\//g, '\u00A0</' )
-		.replace( / <o:p><\/o:p>/g, '\u00A0<o:p></o:p>' );
+		.replace( / <o:p><\/o:p>/g, '\u00A0<o:p></o:p>' )
+		.replace( />[^\S\r\n]*[\r\n]\s*</g, '><' ); // Single-pass pattern: no nested quantifiers, so no heavy backtracking.
 }
 
 /**

diff --git a/src/filters/utils.js b/src/filters/utils.js
@@ -0,0 +1,22 @@
+/**
+ * @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
+ * For licensing, see LICENSE.md.
+ */
+
+/**
+ * @module paste-from-office/filters/utils
+ */
+
+/* globals btoa */
+
+/**
+ * Converts given HEX string to base64 representation.
+ *
+ * @param {String} hexString The HEX string to be converted. May be empty.
+ * @returns {String} Base64 representation of a given HEX string (an empty string if there is no full HEX pair).
+ */
+export function convertHexToBase64( hexString ) {
+	// `match()` returns `null` when there is not a single pair in the string, hence the fallback to an empty array.
+	const pairs = hexString.match( /\w{2}/g ) || [];
+	return btoa( pairs.map( pair => String.fromCharCode( parseInt( pair, 16 ) ) ).join( '' ) );
+}
diff --git a/src/pastefromoffice.js b/src/pastefromoffice.js
@@ -12,6 +12,7 @@ import Clipboard from '@ckeditor/ckeditor5-clipboard/src/clipboard';
 
 import { parseHtml } from './filters/parse';
 import { transformListItemLikeElementsIntoLists } from './filters/list';
+import { replaceImagesSourceWithBase64 } from './filters/image';
 
 /**
  * The Paste from Office plugin.
@@ -41,7 +42,7 @@ export default class PasteFromOffice extends Plugin {
 			const html = data.dataTransfer.getData( 'text/html' );
 
 			if ( isWordInput( html ) ) {
-				data.content = this._normalizeWordInput( html );
+				data.content = this._normalizeWordInput( html, data.dataTransfer );
 			}
 		}, { priority: 'high' } );
 	}
@@ -53,11 +54,14 @@ export default class PasteFromOffice extends Plugin {
 	 *
 	 * @protected
 	 * @param {String} input Word input.
+	 * @param {module:clipboard/datatransfer~DataTransfer} dataTransfer Data transfer instance.
 	 * @returns {module:engine/view/documentfragment~DocumentFragment} Normalized input.
 	 */
-	_normalizeWordInput( input ) {
+	_normalizeWordInput( input, dataTransfer ) {
 		const { body, stylesString } = parseHtml( input );
-		transformListItemLikeElementsIntoLists( body, stylesString, this.editor.editing.view );
+
+		transformListItemLikeElementsIntoLists( body, stylesString );
+		replaceImagesSourceWithBase64( body, dataTransfer.getData( 'text/rtf' ) );
 
 		return body;
 	}

diff --git a/tests/_data/image/adjacent-groups/adjacent-groups.docx b/tests/_data/image/adjacent-groups/adjacent-groups.docx