Skip to content

Commit

Permalink
TIKA-3441 -- improve likelihood that tesseract processes will be shut down on crash.
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Jun 9, 2021
1 parent fd98eee commit d7fa2cd
Showing 1 changed file with 52 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -530,37 +530,52 @@ private void doOCR(File input, File output, TesseractOCRConfig config) throws IO

ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
final Process process = pb.start();
Process process = null;
try {
process = pb.start();
runOCRProcess(process, config.getTimeout());
} finally {
if (process != null) {
process.destroyForcibly();
}
}
}

private void runOCRProcess(Process process, int timeout) throws IOException, TikaException {
process.getOutputStream().close();
InputStream out = process.getInputStream();
InputStream err = process.getErrorStream();

logStream("OCR MSG", out, input);
logStream("OCR ERROR", err, input);

FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
public Integer call() throws Exception {
return process.waitFor();
}
});

Thread waitThread = new Thread(waitTask);
waitThread.start();

StringBuilder outBuilder = new StringBuilder();
StringBuilder errBuilder = new StringBuilder();
Thread outThread = logStream(out, outBuilder);
Thread errThread = logStream(err, errBuilder);
outThread.start();
errThread.start();

int exitValue = Integer.MIN_VALUE;
try {
waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
if (!finished) {
throw new TikaException("TesseractOCRParser timeout");
}
exitValue = process.exitValue();
} catch (InterruptedException e) {
waitThread.interrupt();
process.destroy();
Thread.currentThread().interrupt();
throw new TikaException("TesseractOCRParser interrupted", e);
} catch (ExecutionException e) {
// should not be thrown
} catch (TimeoutException e) {
waitThread.interrupt();
process.destroy();
throw new TikaException("TesseractOCRParser timeout", e);
} catch (IllegalThreadStateException e) {
//this _should_ never be thrown
throw new TikaException("TesseractOCRParser timeout");
}
if (exitValue > 0) {
try {
//make sure this thread is actually done
errThread.join(1000);
} catch (InterruptedException e) {
//swallow
}
throw new TikaException(
"TesseractOCRParser bad exit value " + exitValue + " err msg: " +
errBuilder.toString());
}
}

Expand Down Expand Up @@ -607,24 +622,22 @@ private void extractHOCROutput(InputStream is, ParseContext parseContext,
* stream of the given process to not block the process. The stream is closed
* once fully processed.
*/
private void logStream(final String logType, final InputStream stream, final File file) {
new Thread() {
public void run() {
Reader reader = new InputStreamReader(stream, UTF_8);
StringBuilder out = new StringBuilder();
char[] buffer = new char[1024];
try {
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
out.append(buffer, 0, n);
} catch (IOException e) {

} finally {
IOUtils.closeQuietly(stream);
private Thread logStream(final InputStream stream, final StringBuilder out) {
return new Thread(() -> {
Reader reader = new InputStreamReader(stream, UTF_8);
char[] buffer = new char[1024];
try {
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
out.append(buffer, 0, n);
}

LOG.debug("{}", out);
} catch (IOException e) {
//swallow
} finally {
IOUtils.closeQuietly(stream);
}
}.start();

LOG.debug("{}", out);
});
}

static String getTesseractProg() {
Expand Down

0 comments on commit d7fa2cd

Please sign in to comment.