Skip to content

Commit

Permalink
bbottema#9 Implemented RTF to HTML conversion according to RTF spec
Browse files Browse the repository at this point in the history
  • Loading branch information
Konstantin committed Oct 12, 2019
1 parent f1657dd commit 55bc3a1
Show file tree
Hide file tree
Showing 7 changed files with 1,834 additions and 145 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,150 +18,149 @@
import org.simplejavamail.outlookmessageparser.rtf.util.CharsetHelper;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.compile;
import static org.simplejavamail.outlookmessageparser.rtf.util.ByteUtil.hexToString;
import static org.simplejavamail.outlookmessageparser.rtf.util.CharsetHelper.WINDOWS_CHARSET;

/**
* This class is intended to be used for certain RTF related operations such as extraction of plain HTML from an RTF text.
*/
public class SimpleRTF2HTMLConverter implements RTF2HTMLConverter {
// Matches one RTF control token starting at a backslash. Capture groups:
//   group(2) - a control symbol: a single non-letter character following the backslash
//   group(4) - the letters of a control word
//   group(5) - the optional numeric parameter of the control word (an optional '-' followed by digits; may be empty)
// A single space directly after a control word is consumed as part of the match.
private static Pattern CONTROL_WORD = Pattern.compile("\\\\(([^a-zA-Z])|(([a-zA-Z]+)(-?[\\d]*) ?))");
// Matches one escaped byte written as \'hh; group(1) holds the two hex digits.
private static Pattern ENCODED_CHARACTER = Pattern.compile("\\\\'([0-9a-fA-F]{2})");

// Spellings of the opening <html tag that are searched for, in this order (no closing '>' so attributes are allowed).
private static final String[] HTML_START_TAGS = { "<html", "<Html", "<HTML" };
// Spellings of the closing </html> tag that are searched for, in this order.
private static final String[] HTML_END_TAGS = { "</html>", "</Html>", "</HTML>" };

/**
 * Converts RTF-wrapped HTML into plain HTML using a pipeline of regex based clean-up steps.
 *
 * @param rtf the RTF text, may be {@code null}
 * @return the extracted HTML, or {@code null} when {@code rtf} is {@code null}
 */
public String rtf2html(final String rtf) {
    if (rtf == null) {
        return null;
    }

    // the code page has to be read from the complete document, before the HTML section is cut out
    final Charset charset = extractCodepage(rtf);

    // known control words go first, otherwise the hex decoding below would pick up single escaped hex values from them
    String html = replaceSpecialSequences(fetchHtmlSection(rtf));
    // header control values (\fN followed by escaped hex) are decoded with the default charset
    html = replaceHexSequences(html, "(?:\\\\f\\d(?:\\\\'..)+)", WINDOWS_CHARSET);
    // all remaining escaped hex values are encoded text (which might be DBCS like CP936)
    html = replaceHexSequences(html, "(?:\\\\'..)+", charset);

    return replaceLineBreaks(cleanupRemainingSequences(html));
}

/**
 * Strips the RTF sequences that are still present after the special and hex sequences have been handled.
 *
 * @param plain the partially cleaned text
 * @return the text without font sequences and leftover control words, with the BM__MailAutoSig markers removed
 *         (the text between the markers is kept)
 */
private String cleanupRemainingSequences(String plain) {
    // clear all \f sequences including font names like Courier new
    final String withoutFonts = plain.replaceAll("(\\\\f\\d.+?;)+", "");
    // drop every remaining \<rtfsequence>, e.g. \htmlrtf
    final String withoutControlWords = withoutFonts.replaceAll("\\\\\\S+", "");
    // unwrap the signature: only the two markers are removed
    return withoutControlWords.replaceAll("BM__MailAutoSig((?s).*?(?-s))BM__MailAutoSig", "$1");
}

/**
 * Looks up the charset announced by the first {@code \ansicpg} control word in the document.
 *
 * @param rtf the complete RTF text
 * @return the charset resolved from the code page, or {@code WINDOWS_CHARSET} when no {@code \ansicpg} is found
 */
private Charset extractCodepage(String rtf) {
    final Matcher ansiCodePage = compile("(?:\\\\ansicpg(?<codePage>.+?)\\\\)+").matcher(rtf);
    if (!ansiCodePage.find()) {
        return WINDOWS_CHARSET; // fallback
    }
    return CharsetHelper.findCharset(ansiCodePage.group("codePage"));
}

/**
 * Normalizes line breaks in the converted text.
 *
 * @param text the converted text
 * @return the text with runs of two or more " &lt;br/&gt; " collapsed into one, CRLF turned into LF, and any
 *         remaining carriage returns and NUL characters removed
 */
private String replaceLineBreaks(final String text) {
    final String collapsedBreaks = text.replaceAll("( <br/> ( <br/> )+)", " <br/> ");
    final String unixNewlines = collapsedBreaks.replaceAll("\\r\\n", "\n");
    return unixNewlines.replaceAll("[\\r\\u0000]", "");
}

/**
 * Replaces every run of escaped hex bytes (<code>\'hh</code>) matched by {@code sequencesToMatch} with the text
 * obtained by decoding those bytes in the given charset.
 *
 * @param text             the text to process
 * @param sequencesToMatch regex selecting the spans to decode; the hex pairs of all <code>\'hh</code> escapes inside
 *                         a span are concatenated and decoded together, anything else inside the span is dropped
 * @param charset          the charset handed to {@code hexToString} for decoding
 * @return the text with all matched spans replaced; text outside the spans is copied unchanged
 */
private String replaceHexSequences(final String text, String sequencesToMatch, final Charset charset) {
    final StringBuilder res = new StringBuilder(text.length());
    // compiled once per call instead of once per matched span
    final Pattern escapedHexCharacter = compile("\\\\'(..)");
    final Matcher escapedHexGroupMatcher = compile(sequencesToMatch).matcher(text);
    int lastPosition = 0;

    while (escapedHexGroupMatcher.find()) {
        // copy the untouched text between the previous span and this one
        res.append(text, lastPosition, escapedHexGroupMatcher.start());

        // collect the raw hex digits of the whole span so multi-byte characters are decoded as a unit
        final StringBuilder hexText = new StringBuilder();
        final Matcher hexCharacterMatcher = escapedHexCharacter.matcher(escapedHexGroupMatcher.group(0));
        while (hexCharacterMatcher.find()) {
            hexText.append(hexCharacterMatcher.group(1));
        }
        res.append(hexToString(hexText.toString(), charset));

        lastPosition = escapedHexGroupMatcher.end();
    }

    // Always append only the tail. Without any match lastPosition is still 0, so this copies the whole text.
    // Deciding on "is the buffer still empty" instead would re-append the full input (escapes included) whenever
    // a span at index 0 decodes to an empty string.
    res.append(text, lastPosition, text.length());
    return res.toString();
}

/**
 * Cuts the HTML document out of the RTF text, or wraps the text in a minimal HTML document when it does not
 * contain one.
 *
 * @param text the RTF text
 * @return the span from the first opening html tag up to and including the first closing html tag that follows
 *         it (still containing RTF codes that need to be cleaned); when no such pair exists, the complete text
 *         embedded in a generated html/body wrapper with http:// and mailto: references turned into hyperlinks
 */
private String fetchHtmlSection(final String text) {
    // locate the opening tag, trying the known spellings in order
    int htmlStart = -1;
    for (int i = 0; i < HTML_START_TAGS.length && htmlStart < 0; i++) {
        htmlStart = text.indexOf(HTML_START_TAGS[i]);
    }

    // Locate the closing tag, but only behind the opening tag: a closing tag in front of it cannot end the section
    // and would make substring() fail. htmlEnd is the exclusive end index, i.e. it points just past the closing tag.
    int htmlEnd = -1;
    if (htmlStart >= 0) {
        for (int i = 0; i < HTML_END_TAGS.length && htmlEnd < 0; i++) {
            final int tagPosition = text.indexOf(HTML_END_TAGS[i], htmlStart);
            if (tagPosition >= 0) {
                htmlEnd = tagPosition + HTML_END_TAGS[i].length();
            }
        }
    }

    if (htmlStart >= 0 && htmlEnd >= 0) {
        // Trim rtf code. htmlEnd is already exclusive; adding 1 would include a stray character after the closing
        // tag and throw StringIndexOutOfBoundsException when the closing tag is the very end of the text.
        return text.substring(htmlStart, htmlEnd);
    }

    //embed code within html tags
    String html = "<html><body style=\"font-family:'Courier',monospace;font-size:10pt;\">" + text + "</body></html>";
    //collapse each run of line breaks into a single space
    //NOTE(review): the previous comment on this line spoke of "html breaks", but the replacement is a space - confirm intent
    html = html.replaceAll("[\\n\\r]+", " ");
    //create hyperlinks
    html = html.replaceAll("(http://\\S+)", "<a href=\"$1\">$1</a>");
    return html.replaceAll("mailto:(\\S+@\\S+)", "<a href=\"mailto:$1\">$1</a>");
}

/**
 * Applies a fixed, ordered list of regex rewrites that remove or translate known RTF constructs.
 * The order matters: later rules work on the output of earlier ones.
 *
 * @param text the HTML section that still contains RTF codes
 * @return the text after all rewrites have been applied
 */
private String replaceSpecialSequences(final String text) {
    final String[][] rewrites = {
            // whatever color control sequence, e.g. {\sp{\sn fillColor}{\sv 14935011}}{\sp{\sn fFilled}{\sv 1}}
            // NOTE(review): as written this regex matches a backslash followed by literal 'S' characters,
            // not the \S (non-whitespace) class - confirm whether that is intended
            { "\\{\\\\S+ [^\\s\\\\}]*\\}", "" },
            // hyperlink sequences like {HYPERLINK "http://xyz.com/print.jpg"}
            { "\\{HYPERLINK[^\\}]*\\}", "" },
            // plain text sequences like {\pntext *\tab}
            { "\\{\\\\pntext[^\\}]*\\}", "" },
            // embedded tags like {\*\htmltag84 &#43;} - only the HTML entity is kept
            { "\\{\\\\\\*\\\\htmltag\\d+ (&[#\\w]+;)}\\\\htmlrtf.*\\\\htmlrtf0 ", "$1" },
            // curly braces that are NOT escaped with a backslash, thus marking the end / start of an RTF sequence
            { "([^\\\\])" + "\\}+", "$1" },
            { "([^\\\\])" + "\\{+", "$1" },
            // curly braces that are escaped with a backslash, thus representing an actual brace
            { "\\\\\\}", "}" },
            { "\\\\\\{", "{" },
            // \par (optionally followed by 'd' characters, so \pard as well) becomes a newline
            { "\\\\pard*", "\n" },
            // \tab becomes a tab character
            { "\\\\tab", "\t" },
            // \*\<rtfsequence> like e.g.: \*\fldinst
            { "\\\\\\*\\\\\\S+", "" },
    };

    String replacedText = text;
    for (final String[] rewrite : rewrites) {
        replacedText = replacedText.replaceAll(rewrite[0], rewrite[1]);
    }
    return replacedText;
}
public String rtf2html(String rtf) {
Charset charset = WINDOWS_CHARSET;

// RTF processing requires stack holding current settings, each group adds new settings to stack
LinkedList<Group> groupStack = new LinkedList<>();
groupStack.add(new Group());

Matcher controlWordMatcher = CONTROL_WORD.matcher(rtf);
Matcher encodedCharMatcher = ENCODED_CHARACTER.matcher(rtf);
StringBuilder result = new StringBuilder();
int length = rtf.length();
int charIndex = 0;

while (charIndex < length) {
char c = rtf.charAt(charIndex);
Group currentGroup = groupStack.getFirst();
if (c == '\r' || c == '\n') {
charIndex++;
} else if (c == '{') { //entering group
groupStack.addFirst(currentGroup.copy());
charIndex++;
} else if (c == '}') { //exiting group
groupStack.removeFirst();
//Not outputting anything after last closing brace matching opening brace.
if (groupStack.size() == 1) {
break;
}
charIndex++;
} else if (c == '\\') {

// matching ansi-encoded sequences like \'f5\'93
encodedCharMatcher.region(charIndex, length);
if (encodedCharMatcher.lookingAt()) {
StringBuilder encodedSequence = new StringBuilder();
while (encodedCharMatcher.lookingAt()) {
encodedSequence.append(encodedCharMatcher.group(1));
charIndex += 4;
encodedCharMatcher.region(charIndex, length);
}
String decoded = hexToString(encodedSequence.toString(), charset);
append(result, decoded, currentGroup);
continue;
}

// set matcher to current char position and match from it
controlWordMatcher.region(charIndex, length);
if (!controlWordMatcher.lookingAt()) {
throw new IllegalStateException("RTF file has invalid structure. Failed to match character '" +
c + "' at [" + charIndex + "/" + length + "] to a control symbol or word.");
}

//checking for control symbol or control word
//control word can have optional number following it and the option space as well
Integer controlNumber = null;
String controlWord = controlWordMatcher.group(2); // group(2) matches control symbol
if (controlWord == null) {
controlWord = controlWordMatcher.group(4); // group(2) matches control word
String controlNumberString = controlWordMatcher.group(5);
if (!"".equals(controlNumberString)) {
controlNumber = Integer.valueOf(controlNumberString);
}
}
charIndex += controlWordMatcher.end() - controlWordMatcher.start();

switch (controlWord) {
case "par":
append(result, "\n", currentGroup);
break;
case "tab":
append(result, "\t", currentGroup);
break;
case "htmlrtf":
//htmlrtf starts ignored text area, htmlrtf0 ends it
//Though technically this is not a group, it's easier to treat it as such to ignore everything in between
currentGroup.htmlRtf = controlNumber == null;
break;
case "ansicpg":
//charset definition is important for decoding ansi encoded values
charset = CharsetHelper.findCharset(controlNumber);
break;
case "fonttbl": // skipping these groups contents - these are font and color settings
case "colortbl":
currentGroup.ignore = true;
break;
case "uc": // This denotes a number of characters to skip after unicode symbols
currentGroup.unicodeCharLength = controlNumber == null ? 1 : controlNumber;
break;
case "u": // Unicode symbols
if (controlNumber != null) {
char unicodeSymbol = (char) controlNumber.intValue();
append(result, Character.toString(unicodeSymbol), currentGroup);
charIndex += currentGroup.unicodeCharLength;
}
break;
case "{": // Escaped characters
case "}":
case "\\":
append(result, controlWord, currentGroup);
break;
default:
}

} else {
append(result, c + "", currentGroup);
charIndex++;
}
}
return result.toString();
}


/**
 * Appends text to the output unless the current group is suppressed.
 *
 * @param result the output buffer
 * @param symbol the text to append
 * @param group  the settings of the current group; nothing is appended when its {@code ignore} or
 *               {@code htmlRtf} flag is set
 */
private void append(StringBuilder result, String symbol, Group group) {
    final boolean suppressed = group.ignore || group.htmlRtf;
    if (!suppressed) {
        result.append(symbol);
    }
}

/**
 * Settings that apply to the RTF group currently being processed.
 */
private static class Group {
    /** When set, text of this group is not written to the output. */
    boolean ignore = false;
    /** Number of characters to skip after a unicode symbol. */
    int unicodeCharLength = 1;
    /** When set, text is not written to the output because it lies inside an htmlrtf area. */
    boolean htmlRtf = false;

    /**
     * @return a new, independent instance carrying the same settings as this one
     */
    Group copy() {
        final Group duplicate = new Group();
        duplicate.htmlRtf = htmlRtf;
        duplicate.unicodeCharLength = unicodeCharLength;
        duplicate.ignore = ignore;
        return duplicate;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ public class CharsetHelper {

public static final Charset WINDOWS_CHARSET = Charset.forName("CP1252");

// Resolves an RTF code page to a Charset: each prefix in CHARSET_PREFIXES is combined with the code page and
// looked up with Charset.forName; the first name that is supported wins.
// Throws UnsupportedCharsetException when none of the prefixed names is supported.
// NOTE(review): this is a diff view - the String variants of the signature and of the final throw are the
// previous revision, the Integer / "" + rtfCodePage variants are the new revision.
public static Charset findCharset(String rtfCodePage) {
public static Charset findCharset(Integer rtfCodePage) {
for (String prefix : CHARSET_PREFIXES) {
try {
return Charset.forName(prefix + rtfCodePage);
} catch (UnsupportedCharsetException ignore) {
// this prefix does not yield a supported charset name, try the next one
}
}
throw new UnsupportedCharsetException(rtfCodePage);
// the exception constructor takes a String, hence the concatenation for the Integer code page
throw new UnsupportedCharsetException("" + rtfCodePage);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Scanner;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.assertj.core.api.Assertions.assertThat;
import static org.simplejavamail.outlookmessageparser.TestUtils.classpathFileToString;
import static org.simplejavamail.outlookmessageparser.TestUtils.normalizeText;

public class HighoverEmailsTest {

Expand Down Expand Up @@ -492,8 +493,7 @@ public void testChineseMessage()
"酒店研发部\n" +
" \n");

InputStream resourceAsStream = OutlookMessageParser.class.getClassLoader().getResourceAsStream("test-messages/chinese message.html");
String expectedHtml = new Scanner(resourceAsStream, UTF_8.name()).useDelimiter("\\A").next();
String expectedHtml = classpathFileToString("/test-messages/chinese message.html", UTF_8);
assertThat(normalizeText(msg.getConvertedBodyHTML())).isEqualTo(normalizeText(expectedHtml));
}

Expand Down Expand Up @@ -803,10 +803,6 @@ private static OutlookRecipient createRecipient(String toName, String toEmail) {
return recipient;
}

/**
 * @return the text with every CRLF and every remaining lone CR replaced by LF
 */
private static String normalizeText(String text) {
    final String withoutCrLf = text.replaceAll("\\r\\n", "\n");
    return withoutCrLf.replaceAll("\\r", "\n");
}

private static OutlookMessage parseMsgFile(String msgPath)
throws IOException {
InputStream resourceAsStream = OutlookMessageParser.class.getClassLoader().getResourceAsStream(msgPath);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (C) ${project.inceptionYear} Benny Bottema ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.simplejavamail.outlookmessageparser;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Helpers shared by the tests.
 */
public class TestUtils {
    /**
     * Reads a classpath resource completely into a string.
     *
     * @param classPathFile resource name as understood by {@link Class#getResource(String)}, e.g.
     *                      {@code "/test-messages/chinese message.html"}
     * @param charset       the charset used to decode the bytes of the resource
     * @return the decoded content of the resource
     * @throws IllegalArgumentException when the resource cannot be found on the classpath
     * @throws RuntimeException         wrapping the {@link IOException} or {@link URISyntaxException} raised
     *                                  while locating or reading the resource
     */
    public static String classpathFileToString(String classPathFile, Charset charset) {
        final java.net.URL resource = TestUtils.class.getResource(classPathFile);
        if (resource == null) {
            // fail with the resource name instead of an anonymous NullPointerException
            throw new IllegalArgumentException("Classpath resource not found: " + classPathFile);
        }
        try {
            // decode with the requested charset; without it the platform default would be used
            return new String(Files.readAllBytes(Paths.get(resource.toURI())), charset);
        } catch (IOException | URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return the text with every CRLF and every remaining lone CR replaced by LF
     */
    public static String normalizeText(String text) {
        return text.replaceAll("\\r\\n", "\n").replaceAll("\\r", "\n");
    }
}
Loading

0 comments on commit 55bc3a1

Please sign in to comment.