Skip to content

Commit

Permalink
TIKA-3334 -- fix thread safety bug in handling embedded docs in open …
Browse files Browse the repository at this point in the history
…office parser
  • Loading branch information
tballison committed Mar 23, 2021
1 parent 2b8c9a3 commit da05576
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Release 1.26 - ??/??/????

* Fix thread safety bug in OpenOffice parser (TIKA-3334).

* The "writeLimit" header now pertains to the combined characters
written per container document (and embedded documents) in the /rmeta
endpoint in tika-server (TIKA-3325); it no longer functions only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public class OpenDocumentParser extends AbstractParser {
private static final long serialVersionUID = -6410276875438618287L;

private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
Expand Down Expand Up @@ -103,8 +103,6 @@ public class OpenDocumentParser extends AbstractParser {
private static final String META_NAME = "meta.xml";
private static final String MANIFEST_NAME = "META-INF/manifest.xml";

private EmbeddedDocumentUtil embeddedDocumentUtil;

private Parser meta = new OpenDocumentMetaParser();

private Parser content = new OpenDocumentContentParser();
Expand Down Expand Up @@ -136,7 +134,7 @@ public void parse(
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {

embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);

// Open the Zip stream
// Use a File if we can, and an already open zip is even better
Expand Down Expand Up @@ -167,14 +165,14 @@ public void parse(
try {
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
Expand All @@ -200,15 +198,17 @@ public void setExtractMacros(boolean extractMacros) {

private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
ParseContext context,
EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil) throws IOException,
TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
List<SAXException> saxExceptions = new ArrayList<>();
do {
try {
handleZipEntry(entry, zipStream, metadata, context, handler);
handleZipEntry(entry, zipStream, metadata, context, handler, embeddedDocumentUtil);
} catch (SAXException e) {
if (e.getCause() instanceof EncryptedDocumentException) {
throw (EncryptedDocumentException)e.getCause();
Expand All @@ -225,32 +225,36 @@ private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
}

private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
ParseContext context, EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, TikaException, SAXException {

ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler,
embeddedDocumentUtil);
}
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
handler);
handler, embeddedDocumentUtil);
}

Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler,
embeddedDocumentUtil);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, ContentHandler handler)
ParseContext context, ContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, SAXException, TikaException {
if (entry == null) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

Expand Down Expand Up @@ -652,4 +657,38 @@ private ParseContext getNonRecursingParseContext() {
parseContext.set(Parser.class, new EmptyParser());
return parseContext;
}

@Test
public void testMultiThreaded() throws Exception {
int numThreads = 10;
ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
ExecutorCompletionService<Integer> executorCompletionService =
new ExecutorCompletionService<>(executorService);

for (int i = 0; i < numThreads; i++) {
executorCompletionService.submit(new Callable<Integer>() {
@Override
public Integer call() throws Exception {
for (int i = 0; i < 10; i++) {
List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt");
assertEquals(3, metadataList.size());
assertEquals("THUMBNAIL",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
return 1;
}
});
}

try {
int finished = 0;
while (finished < numThreads) {
Future<Integer> future = executorCompletionService.take();
future.get();
finished++;
}
} finally {
executorService.shutdownNow();
}
}
}

0 comments on commit da05576

Please sign in to comment.