ckeditor · Reinmar · Oct 25, 2018 · Aug 13, 2018 · Aug 21, 2018 · Aug 21, 2018
diff --git a/package.json b/package.json
@@ -19,8 +19,11 @@
     "@ckeditor/ckeditor5-engine": "^10.2.0",
     "@ckeditor/ckeditor5-enter": "^10.1.1",
     "@ckeditor/ckeditor5-heading": "^10.0.2",
+    "@ckeditor/ckeditor5-link": "^10.0.3",
+    "@ckeditor/ckeditor5-list": "^11.0.1",
     "@ckeditor/ckeditor5-paragraph": "^10.0.2",
     "@ckeditor/ckeditor5-table": "^10.1.0",
+    "@ckeditor/ckeditor5-utils": "^10.2.1",
     "eslint": "^4.15.0",
     "eslint-config-ckeditor5": "^1.0.7",
     "husky": "^0.14.3",

diff --git a/src/filters/list.js b/src/filters/list.js
@@ -0,0 +1,196 @@
+/**
+ * @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
+ * For licensing, see LICENSE.md.
+ */
+
+/**
+ * @module pastefromoffice/filters/list
+ */
+
+import Element from '@ckeditor/ckeditor5-engine/src/view/element';
+import Matcher from '@ckeditor/ckeditor5-engine/src/view/matcher';
+import Position from '@ckeditor/ckeditor5-engine/src/view/position';
+import TreeWalker from '@ckeditor/ckeditor5-engine/src/view/treewalker';
+import UpcastWriter from '@ckeditor/ckeditor5-engine/src/view/upcastwriter';
+
+/**
+ * Transforms Word specific list-like elements to the semantic HTML lists.
+ *
+ * Lists in Word are represented by block elements with special attributes like:
+ *
+ *		<p class=MsoListParagraphCxSpFirst style='mso-list:l1 level1 lfo1'>...</p> // Paragraph based list.
+ *		<h1 style='mso-list:l0 level1 lfo1'>...</h1> // Heading 1 based list.
+ *
+ * @param {module:engine/view/node~Node|module:engine/view/documentfragment~DocumentFragment} bodyView The view
+ * structure which to transform.
+ * @returns {module:engine/view/node~Node|module:engine/view/documentfragment~DocumentFragment} The view
+ * structure instance with list-like elements transformed into semantic lists.
+ */
+export function paragraphsToLists( bodyView, stylesString ) {
+	const firstChild = bodyView.getChild( 0 );
+
+	if ( firstChild ) {
+		const listNodes = findAllListNodes( Position.createBefore( firstChild ) );
+		createLists( listNodes, stylesString );
+	}
+
+	return bodyView;
+}
+
+// Shared writer used for View elements manipulation. `createLists()` uses it to insert list elements
+// and to move and rename list items, `removeBulletElement()` uses it to remove bullet spans.
+const writer = new UpcastWriter();
+
+// Matcher for finding list-like elements.
+const listMatcher = new Matcher( {
+	name: /^p|h\d+$/,
+	styles: {
+		'mso-list': /.*/
+	}
+} );
+
+// Matcher for finding `span` elements holding lists numbering/bullets.
+// It selects `span` elements whose `mso-list` style has the `Ignore` value and is used by `removeBulletElement()`.
+const listBulletMatcher = new Matcher( {
+	name: 'span',
+	styles: {
+		'mso-list': 'Ignore'
+	}
+} );
+
+// Finds all list-like nodes starting from a given position.
+//
+// @param {module:engine/src/view/position~Position} startPosition Position from which to start looking.
+// @returns {Array.<Object>} Array of found list items. Each item is an object containing:
+//
+//		* {module:engine/src/view/element~Element} element List-like element.
+//		* {Number} id List item id parsed from `mso-list` style (see `getListItemData()` function).
+//		* {Number} order List item creation order parsed from `mso-list` style (see `getListItemData()` function).
+//		* {Number} indent List item indentation level parsed from `mso-list` style (see `getListItemData()` function).
+function findAllListNodes( startPosition ) {
+	const treeWalker = new TreeWalker( { startPosition, ignoreElementEnd: true } );
+
+	// Find all list nodes.
+	const listNodes = [];
+	for ( const value of treeWalker ) {
+		if ( value.type === 'elementStart' && listMatcher.match( value.item ) ) {
+			const itemData = getListItemData( value.item );
+
+			listNodes.push( {
+				element: value.item,
+				id: itemData.id,
+				order: itemData.order,
+				indent: itemData.indent
+			} );
+		}
+	}
+
+	return listNodes;
+}
+
+// Transforms given list-like nodes into semantic lists. As the function operates on a provided
+// {module:engine/src/view/element~Element elements}, it will modify the view structure to which those list elements belongs.
+//
+// @param {Array.<Object>} listItems Array containing list items data. Usually it is the output of `findAllListNodes()` function.
+// @param {String} styles CSS styles which may contain additional data about lists format.
+function createLists( listItems, styles ) {
+	if ( listItems.length ) {
+		let currentList = null;
+		let previousListItem = null;
+
+		for ( const listItem of listItems ) {
+			const listNode = listItem.element;
+
+			if ( !previousListItem || previousListItem.id !== listItem.id ) {
+				const listStyle = findListType( listItem, styles );
+				currentList = new Element( listStyle.type );
+				writer.insertChild( listNode.parent.getChildIndex( listNode ), currentList, listNode.parent );
+			}
+
+			removeBulletElement( listNode );
+
+			writer.appendChild( listNode, currentList );
+			writer.rename( 'li', listNode );
+
+			previousListItem = listItem;
+		}
+	}
+}
+
+// Extracts list information from Word specific list style like:
+//
+//		`style="mso-list:l1 level1 lfo1"`
+//
+// where:
+//
+//		* `l1` is a list id (all elements with the same id belongs to the same list),
+//		* `level1` is a list item indentation level,
+//		* `lfo1` is a list insertion order in a document.
+//
+// @param {module:engine/view/element~Element} element List-like element from which data is extracted.
+// @returns {Object} result
+// @returns {Number} result.id List id.
+// @returns {Number} result.order List creation order.
+// @returns {Number} result.indent List indentation level.
+function getListItemData( element ) {
+	const data = {};
+	const listStyle = element.getStyle( 'mso-list' );
+
+	if ( listStyle ) {
+		data.id = parseInt( listStyle.match( /(^|\s+)l(\d+)/i )[ 2 ] );
+		data.order = parseInt( listStyle.match( /\s*lfo(\d+)/i )[ 1 ] );
+		data.indent = parseInt( listStyle.match( /\s*level(\d+)/i )[ 1 ] );
+	}
+
+	return data;
+}
+
+// Checks list item style based on a provided CSS.
+//
+// List item style is extracted from CSS stylesheet. Each list with its specific style attribute value (`mso-list:l1 level1 lfo1`)
+// has its dedicated properties in a CSS stylesheet defined with a selector like:
+//
+// 		@list l1:level1 { ... }
+//
+// It contains `mso-level-number-format` property which defines list numbering/bullet style. If this property
+// is not defined it means default `decimal` numbering.
+//
+// Here CSS string representation is used as `mso-level-number-format` is invalid CSS property which gets removed during parsing.
+//
+// @param {Object} listItem List item for which list style will be searched for.
+// @param {String} styles CSS stylesheet.
+// @returns {Object} result
+// @returns {String} result.type Type of the list, could be `ul` or `ol`.
+// @returns {String} result.style List style like `decimal`, `lower-roman`, etc. It is passed directly from Word stylesheet
+// so may be not compatible with CSS `list-style-type` accepted values.
+function findListType( listItem, styles ) {
+	const listStyleRegexp = new RegExp( `@list l${ listItem.id }:level${ listItem.indent }\\s*({[^}]*)`, 'gi' );
+	const listStyleTypeRegex = /mso-level-number-format:([^;]*);/gi;
+
+	const listStyleMatch = listStyleRegexp.exec( styles );
+
+	let listStyleType = 'decimal'; // Decimal is default one.
+	if ( listStyleMatch && listStyleMatch[ 1 ] ) {
+		const listStyleTypeMatch = listStyleTypeRegex.exec( listStyleMatch[ 1 ] );
+
+		if ( listStyleTypeMatch && listStyleTypeMatch[ 1 ] ) {
+			listStyleType = listStyleTypeMatch[ 1 ].trim();
+		}
+	}
+
+	return {
+		type: listStyleType !== 'bullet' && listStyleType !== 'image' ? 'ol' : 'ul',
+		style: listStyleType
+	};
+}
+
+// Removes span with a numbering/bullet from the given list element.
+//
+// @param {module:engine/view/element~Element} listElement
+function removeBulletElement( listElement ) {
+	const treeWalker = new TreeWalker( { startPosition: Position.createBefore( listElement.getChild( 0 ) ), ignoreElementEnd: true } );
+
+	for ( const value of treeWalker ) {
+		if ( value.type === 'elementStart' && listBulletMatcher.match( value.item ) ) {
+			writer.remove( value.item );
+		}
+	}
+}
diff --git a/src/filters/utils.js b/src/filters/utils.js
@@ -0,0 +1,88 @@
+/**
+ * @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
+ * For licensing, see LICENSE.md.
+ */
+
+/**
+ * @module pastefromoffice/filters/utils
+ */
+
+/* globals DOMParser */
+
+import DomConverter from '@ckeditor/ckeditor5-engine/src/view/domconverter';
+import { NBSP_FILLER } from '@ckeditor/ckeditor5-engine/src/view/filler';
+
+const domParser = new DOMParser();
+const domConverter = new DomConverter( { blockFiller: NBSP_FILLER } );
+
+/**
+ * Parses provided HTML extracting contents of `body` and `style` tags.
+ *
+ * @param {String} htmlString HTML string to be parsed.
+ * @returns {Object} result
+ * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Parsed body
+ * content as a traversable structure.
+ * @returns {String} result.bodyString Entire body content as a string.
+ * @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, each representing
+ * separate `style` tag from the source HTML.
+ * @returns {String} result.stylesString All `style` tags contents combined in the order of occurrence into one string.
+ */
+export function parseHtml( htmlString ) {
+	// Parse htmlString as native Document object.
+	const htmlDocument = domParser.parseFromString( htmlString, 'text/html' );
+
+	// Get `innerHTML` first as transforming to View modifies the source document.
+	const bodyString = htmlDocument.body.innerHTML;
+
+	// Transform document.body to View.
+	const bodyView = documentToView( htmlDocument );
+
+	// Extract stylesheets.
+	const stylesObject = extractStyles( htmlDocument );
+
+	return {
+		body: bodyView,
+		bodyString,
+		styles: stylesObject.styles,
+		stylesString: stylesObject.stylesString
+	};
+}
+
+// Transforms native `Document` object into {@link module:engine/view/documentfragment~DocumentFragment}.
+//
+// @param {Document} htmlDocument Native `Document` object to be transformed.
+// @returns {module:engine/view/documentfragment~DocumentFragment}
+function documentToView( htmlDocument ) {
+	const fragment = htmlDocument.createDocumentFragment();
+	const nodes = htmlDocument.body.childNodes;
+
+	while ( nodes.length > 0 ) {
+		fragment.appendChild( nodes[ 0 ] );
+	}
+
+	return domConverter.domToView( fragment );
+}
+
+// Extracts both `CSSStyleSheet` and string representation from all `style` elements available in a provided `htmlDocument`.
+//
+// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
+// @returns {Object} result
+// @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` object, each representing
+// separate `style` tag from the source object.
+// @returns {String} result.stylesString All `style` tags contents combined in the order of occurrence as one string.
+function extractStyles( htmlDocument ) {
+	const styles = [];
+	const stylesString = [];
+
+	for ( const el of htmlDocument.all ) {
+		if ( el.tagName.toLowerCase() === 'style' && el.sheet && el.sheet.rules && el.sheet.rules.length ) {
+			styles.push( el.sheet );
+			stylesString.push( el.innerHTML );
+		}
+	}
+
+	return {
+		styles,
+		stylesString: stylesString.join( ' ' )
+	};
+}
diff --git a/src/pastefromoffice.js b/src/pastefromoffice.js
@@ -0,0 +1,68 @@
+/**
+ * @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
+ * For licensing, see LICENSE.md.
+ */
+
+/**
+ * @module pastefromoffice/pastefromoffice
+ */
+
+import Plugin from '@ckeditor/ckeditor5-core/src/plugin';
+import Clipboard from '@ckeditor/ckeditor5-clipboard/src/clipboard';
+
+import { parseHtml } from './filters/utils';
+import { paragraphsToLists } from './filters/list';
+
+/**
+ * This plugin handles content pasted from Word and transforms it (if necessary)
+ * to format suitable for editor {@link module:engine/model/model~Model}.
+ *
+ * @extends module:core/plugin~Plugin
+ */
+export default class PasteFromOffice extends Plugin {
+	/**
+	 * @inheritDoc
+	 */
+	static get pluginName() {
+		return 'PasteFromOffice';
+	}
+
+	/**
+	 * @inheritDoc
+	 */
+	init() {
+		const editor = this.editor;
+
+		this.listenTo( editor.plugins.get( Clipboard ), 'inputTransformation', ( evt, data ) => {
+			const html = data.dataTransfer.getData( 'text/html' );
+
+			if ( isWordInput( html ) ) {
+				data.content = this._normalizeWordInput( html );
+			}
+		}, { priority: 'high' } );
+	}
+
+	/**
+	 * Normalizes input pasted from Word to format suitable for editor {@link module:engine/model/model~Model}.
+	 *
+	 * **Notice**: this function was exposed mainly for testing purposes and should not be called directly.
+	 *
+	 * @protected
+	 * @param {String} input Word input.
+	 * @returns {module:engine/view/documentfragment~DocumentFragment} Normalized input.
+	 */
+	_normalizeWordInput( input ) {
+		const { body, stylesString } = parseHtml( input );
+		const normalizedInput = paragraphsToLists( body, stylesString );
+
+		return normalizedInput;
+	}
+}
+
+// Checks if given HTML string is a result of pasting content from Word.
+//
+// @param {String} html HTML string to test.
+// @returns {Boolean} True if given HTML string is a Word HTML.
+function isWordInput( html ) {
+	return !!( html && html.match( /<meta\s*name="?generator"?\s*content="?microsoft\s*word\s*\d+"?\/?>/gi ) );
+}
diff --git a/...es/bold-within-text/bold-within-text.docx → ...es/bold-within-text/bold-within-text.docx b/...es/bold-within-text/bold-within-text.docx → ...es/bold-within-text/bold-within-text.docx
diff --git a/...yles/bold-within-text/input.word2016.html → ...yles/bold-within-text/input.word2016.html b/...yles/bold-within-text/input.word2016.html → ...yles/bold-within-text/input.word2016.html
diff --git a/.../italic-starting-text/input.word2016.html → .../italic-starting-text/input.word2016.html b/.../italic-starting-text/input.word2016.html → .../italic-starting-text/input.word2016.html
diff --git a/...c-starting-text/italic-starting-text.docx → ...c-starting-text/italic-starting-text.docx b/...c-starting-text/italic-starting-text.docx → ...c-starting-text/italic-starting-text.docx
diff --git a/...iple-styles-multiline/input.word2016.html → ...iple-styles-multiline/input.word2016.html b/...iple-styles-multiline/input.word2016.html → ...iple-styles-multiline/input.word2016.html
diff --git a/...s-multiline/multiple-style-multiline.docx → ...s-multiline/multiple-style-multiline.docx b/...s-multiline/multiple-style-multiline.docx → ...s-multiline/multiple-style-multiline.docx
diff --git a/...le-styles-single-line/input.word2016.html → ...le-styles-single-line/input.word2016.html b/...le-styles-single-line/input.word2016.html → ...le-styles-single-line/input.word2016.html
diff --git a/...ngle-line/multiple-style-single-line.docx → ...ngle-line/multiple-style-single-line.docx b/...ngle-line/multiple-style-single-line.docx → ...ngle-line/multiple-style-single-line.docx
diff --git a/...kethrough-ending-text/input.word2016.html → ...kethrough-ending-text/input.word2016.html b/...kethrough-ending-text/input.word2016.html → ...kethrough-ending-text/input.word2016.html
diff --git a/...nding-text/strikethrough-ending-text.docx → ...nding-text/strikethrough-ending-text.docx b/...nding-text/strikethrough-ending-text.docx → ...nding-text/strikethrough-ending-text.docx
diff --git a/...tyles/underlined-text/input.word2016.html → ...tyles/underlined-text/input.word2016.html b/...tyles/underlined-text/input.word2016.html → ...tyles/underlined-text/input.word2016.html
diff --git a/...yles/underlined-text/underlined-text.docx → ...yles/underlined-text/underlined-text.docx b/...yles/underlined-text/underlined-text.docx → ...yles/underlined-text/underlined-text.docx
diff --git a/...a/integration/link/combined/combined.docx → tests/_data/link/combined/combined.docx b/...a/integration/link/combined/combined.docx → tests/_data/link/combined/combined.docx
diff --git a/...gration/link/combined/input.word2016.html → ...s/_data/link/combined/input.word2016.html b/...gration/link/combined/input.word2016.html → ...s/_data/link/combined/input.word2016.html
diff --git a/...gration/link/two-line/input.word2016.html → ...s/_data/link/two-line/input.word2016.html b/...gration/link/two-line/input.word2016.html → ...s/_data/link/two-line/input.word2016.html
diff --git a/...a/integration/link/two-line/two-line.docx → tests/_data/link/two-line/two-line.docx b/...a/integration/link/two-line/two-line.docx → tests/_data/link/two-line/two-line.docx
diff --git a/...tion/link/within-text/input.word2016.html → ...data/link/within-text/input.word2016.html b/...tion/link/within-text/input.word2016.html → ...data/link/within-text/input.word2016.html
diff --git a/...gration/link/within-text/within-text.docx → ...s/_data/link/within-text/within-text.docx b/...gration/link/within-text/within-text.docx → ...s/_data/link/within-text/within-text.docx
diff --git a/tests/_data/list/heading1/heading1.docx b/tests/_data/list/heading1/heading1.docx