From a22017ad6f971712d8d87953109b00b757405630 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:23:43 -0300 Subject: [PATCH 01/31] '#1823: new audio transcription params for Whisper, rename old ones --- .../config/conf/AudioTranscriptConfig.txt | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 67ac350618..fc62f42d7b 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -9,14 +9,23 @@ # you should download it from https://alphacephei.com/vosk/models and put in 'models/vosk/[lang]' folder. implementationClass = iped.engine.task.transcript.VoskTranscriptTask -# Uses a local/remote wav2vec2 implementation for transcription. Accuracy is much better than most Vosk models. -# The local impl is AT LEAST 1 order of magnitude slower than Vosk on high end CPUs. Using a good GPU is highly recommended! -# The remote impl is useful if you have a central server/cluster with many GPUs to be shared among processing nodes. -# For both the local or remote options, please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 -# If you use the local implementation, you must set 'huggingFaceModel' param below. -# If you use the remote implementation, you must set 'wav2vec2Service' param below. +# Uses a local wav2vec2 implementation for transcription. Accuracy is much better than most Vosk models. +# This is AT LEAST 1 order of magnitude slower than Vosk on high end CPUs. Using a good GPU is highly recommended! +# Please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 +# If you enable this, you must set 'huggingFaceModel' param below. 
#implementationClass = iped.engine.task.transcript.Wav2Vec2TranscriptTask -#implementationClass = iped.engine.task.transcript.RemoteWav2Vec2TranscriptTask + +# Uses a local Whisper implementation for transcription. Accuracy is better than wav2vec2 depending on the model. +# This is slower than wav2vec2 depending on the model. Using a very good GPU is highly recommended! +# Please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#whisper +# If you enable this, you must set 'whisperModel' param below. +#implementationClass = iped.engine.task.transcript.WhisperTranscriptTask + +# Uses a remote service for transcription. +# The remote service is useful if you have a central server/cluster with many GPUs to be shared among processing nodes. +# Please check steps on https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 +# If you enable this, you must set 'remoteServiceAddress' param below. +#implementationClass = iped.engine.task.transcript.RemoteAudioTranscriptTask # If you want to use the Microsoft Azure service implementation, comment above and uncomment below. # You MUST include Microsoft client-sdk.jar into plugins folder. @@ -91,11 +100,20 @@ minWordScore = 0.5 # huggingFaceModel = jonatasgrosman/wav2vec2-xls-r-1b-french ######################################### -# RemoteWav2Vec2TranscriptTask options +# Local WhisperTranscriptTask options +######################################### + +# Possible values: tiny, base, small, medium, large-v3 +# large-v3 is much better than medium, but 2x slower and uses 2x more memory. +# If you know the language you want to transcribe, please set the 'language' option above. Auto detection causes mistakes. +whisperModel = medium + +######################################### +# RemoteAudioTranscriptTask options ######################################### # IP:PORT of the service/central node used by the RemoteWav2Vec2TranscriptTask implementation. 
-# wav2vec2Service = 127.0.0.1:11111 +# remoteServiceAddress = 127.0.0.1:11111 ######################################### # MicrosoftTranscriptTask options From 86631420b09b69fc35c8ae859d86e4a90120a9b0 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:24:10 -0300 Subject: [PATCH 02/31] '#1823: load new transcription parameters --- .../engine/config/AudioTranscriptConfig.java | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java index 198d73e62b..e950824bb2 100644 --- a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java +++ b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java @@ -27,7 +27,9 @@ public class AudioTranscriptConfig extends AbstractTaskPropertiesConfig { private static final String MAX_REQUESTS_KEY = "maxConcurrentRequests"; private static final String MIN_WORD_SCORE = "minWordScore"; public static final String HUGGING_FACE_MODEL = "huggingFaceModel"; + public static final String WHISPER_MODEL = "whisperModel"; public static final String WAV2VEC2_SERVICE = "wav2vec2Service"; + public static final String REMOTE_SERVICE = "remoteServiceAddress"; private static final String GOOGLE_MODEL = "googleModel"; private static final String LANG_AUTO_VAL = "auto"; private static final String SKIP_KNOWN_FILES = "skipKnownFiles"; @@ -43,7 +45,8 @@ public class AudioTranscriptConfig extends AbstractTaskPropertiesConfig { private int maxConcurrentRequests; private float minWordScore = 0.7f; private String huggingFaceModel; - private String wav2vec2Service; + private String whisperModel; + private String remoteService; private String googleModel; private boolean skipKnownFiles = true; @@ -109,8 +112,12 @@ public String getHuggingFaceModel() { return huggingFaceModel; } - public String getWav2vec2Service() { - return wav2vec2Service; + 
public String getWhisperModel() { + return whisperModel; + } + + public String getRemoteService() { + return remoteService; } public String getGoogleModel() { @@ -144,9 +151,16 @@ public void processProperties(UTF8Properties properties) { if (huggingFaceModel != null) { huggingFaceModel = huggingFaceModel.trim(); } - wav2vec2Service = properties.getProperty(WAV2VEC2_SERVICE); - if (wav2vec2Service != null) { - wav2vec2Service = wav2vec2Service.trim(); + whisperModel = properties.getProperty(WHISPER_MODEL); + if (whisperModel != null) { + whisperModel = whisperModel.strip(); + } + remoteService = properties.getProperty(REMOTE_SERVICE); + if (remoteService == null) { + remoteService = properties.getProperty(WAV2VEC2_SERVICE); + } + if (remoteService != null) { + remoteService = remoteService.trim(); } googleModel = properties.getProperty(GOOGLE_MODEL); if (googleModel != null) { From 3095e66f483a5c9dcfc0bb251917bef9effce19f Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:26:02 -0300 Subject: [PATCH 03/31] '#1823: rename RemoteWav2Vec2TranscriptTask to RemoteAudioTranscriptTask --- .../transcript/RemoteAudioTranscriptTask.java | 318 +++++++++++++++++ .../RemoteWav2Vec2TranscriptTask.java | 321 +----------------- 2 files changed, 325 insertions(+), 314 deletions(-) create mode 100644 iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java new file mode 100644 index 0000000000..770242a248 --- /dev/null +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java @@ -0,0 +1,318 @@ +package iped.engine.task.transcript; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.DataOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import 
java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.net.ConnectException; +import java.net.Socket; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.tika.io.TemporaryResources; + +import iped.configuration.IConfigurationDirectory; +import iped.data.IItem; +import iped.engine.config.AudioTranscriptConfig; +import iped.engine.config.ConfigurationManager; +import iped.engine.core.Manager; +import iped.engine.io.TimeoutException; +import iped.engine.task.transcript.RemoteWav2Vec2Service.MESSAGES; +import iped.exception.IPEDException; + +public class RemoteAudioTranscriptTask extends AbstractTranscriptTask { + + private static Logger logger = LogManager.getLogger(Wav2Vec2TranscriptTask.class); + + private static final int MAX_CONNECT_ERRORS = 60; + + private static final int UPDATE_SERVERS_INTERVAL_MILLIS = 60000; + + private static List servers = new ArrayList<>(); + + private static int currentServer = -1; + + private static AtomicInteger numConnectErrors = new AtomicInteger(); + + private static AtomicLong audioSendingTime = new AtomicLong(); + + private static AtomicLong transcriptReceiveTime = new AtomicLong(); + + private static AtomicBoolean statsPrinted = new AtomicBoolean(); + + private static long lastUpdateServersTime = 0; + + private static class Server { + + String ip; + int port; + + public String toString() { + return ip + ":" + port; + } + } + + // See https://github.com/sepinf-inc/IPED/issues/1576 + private int getRetryIntervalMillis() { + // 
This depends on how much time worker nodes need to consume their queue. + // Of course audios duration, nodes queue size and performance affect this. + // This tries to be fair with clients independent of their number of threads. + return Manager.getInstance().getNumWorkers() * 100; + } + + @Override + public void init(ConfigurationManager configurationManager) throws Exception { + + super.init(configurationManager); + + if (!this.isEnabled()) { + return; + } + + if (!servers.isEmpty()) { + return; + } + + boolean disable = false; + if (transcriptConfig.getRemoteService() == null) { + String ipedRoot = System.getProperty(IConfigurationDirectory.IPED_ROOT); + if (ipedRoot != null) { + Path path = new File(ipedRoot, "conf/" + AudioTranscriptConfig.CONF_FILE).toPath(); + configurationManager.getConfigurationDirectory().addPath(path); + configurationManager.addObject(transcriptConfig); + configurationManager.loadConfig(transcriptConfig); + // maybe user changed installation configs + if (transcriptConfig.getRemoteService() == null) { + disable = true; + } else { + transcriptConfig.setEnabled(true); + transcriptConfig.setClassName(this.getClass().getName()); + } + } else { + disable = true; + } + } + + if (disable) { + transcriptConfig.setEnabled(false); + logger.warn("Remote transcription module disabled, service address not configured."); + return; + } + + requestServers(true); + + } + + private static synchronized void requestServers(RemoteAudioTranscriptTask task, boolean now) throws IOException { + if (!now && System.currentTimeMillis() - lastUpdateServersTime < UPDATE_SERVERS_INTERVAL_MILLIS) { + return; + } + String[] ipAndPort = task.transcriptConfig.getRemoteService().split(":"); + String ip = ipAndPort[0]; + int port = Integer.parseInt(ipAndPort[1]); + try (Socket client = new Socket(ip, port); + InputStream is = client.getInputStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + PrintWriter writer = new 
PrintWriter(new OutputStreamWriter(client.getOutputStream(), StandardCharsets.UTF_8), true)) { + + client.setSoTimeout(10000); + writer.println(MESSAGES.DISCOVER); + int numServers = Integer.parseInt(reader.readLine()); + List servers = new ArrayList<>(); + for (int i = 0; i < numServers; i++) { + String[] ipPort = reader.readLine().split(":"); + Server server = new Server(); + server.ip = ipPort[0]; + server.port = Integer.parseInt(ipPort[1]); + servers.add(server); + logger.info("Transcription server discovered: {}:{}", server.ip, server.port); + } + RemoteAudioTranscriptTask.servers = servers; + lastUpdateServersTime = System.currentTimeMillis(); + } catch (ConnectException e) { + String msg = "Central transcription node refused connection, is it online? " + e.toString(); + if (servers.isEmpty()) { + throw new IPEDException(msg); + } else { + logger.warn(msg); + } + } + } + + private void requestServers(boolean now) throws IOException { + requestServers(this, now); + } + + + @Override + public void finish() throws Exception { + super.finish(); + if (!statsPrinted.getAndSet(true)) { + int numWorkers = this.worker.manager.getNumWorkers(); + DecimalFormat df = new DecimalFormat(); + logger.info("Time spent to send audios: {}s", df.format(audioSendingTime.get() / (1000 * numWorkers))); + logger.info("Time spent to receive transcriptions: {}s", df.format(transcriptReceiveTime.get() / (1000 * numWorkers))); + } + } + + /** + * Returns a transcription server between the discovered ones using a simple + * circular approach. + * + * @return Server instance to use + */ + private static synchronized Server getServer() { + if (servers.isEmpty()) { + throw new IPEDException("No transcription server available!"); + } + currentServer++; + if (currentServer >= servers.size()) { + currentServer = 0; + } + return servers.get(currentServer); + } + + /** + * Don't convert to WAV on client side, return the audio as is. 
+ */ + @Override + protected File getTempFileToTranscript(IItem evidence, TemporaryResources tmp) throws IOException, InterruptedException { + return evidence.getTempFile(); + } + + @Override + protected TextAndScore transcribeAudio(File tmpFile) throws Exception { + + while (true) { + requestServers(false); + Server server = getServer(); + long requestTime = System.currentTimeMillis(); + try (Socket serverSocket = new Socket(server.ip, server.port); + InputStream is = serverSocket.getInputStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + BufferedOutputStream bos = new BufferedOutputStream(serverSocket.getOutputStream())) { + + numConnectErrors.set(0); + + int timeoutSecs = (int) (MIN_TIMEOUT + TIMEOUT_PER_MB * tmpFile.length() / (1 << 20)); + serverSocket.setSoTimeout(1000 * timeoutSecs); + + String response = reader.readLine(); + if (response == null || MESSAGES.BUSY.toString().equals(response)) { + logger.debug("Transcription server {} busy, trying another one.", server); + sleepBeforeRetry(requestTime); + continue; + } + if (!MESSAGES.ACCEPTED.toString().equals(response)) { + logger.error("Error 0 in communication with {}. 
The audio will be retried.", server); + continue; + } + + logger.debug("Transcription server {} accepted connection", server); + + long t0 = System.currentTimeMillis(); + + bos.write(MESSAGES.VERSION_1_2.toString().getBytes()); + // bos.write("\n".getBytes()); + + bos.write(MESSAGES.AUDIO_SIZE.toString().getBytes()); + + DataOutputStream dos = new DataOutputStream(bos); + // Must use long see #1833 + dos.writeLong(tmpFile.length()); + dos.flush(); + + Files.copy(tmpFile.toPath(), bos); + bos.flush(); + + long t1 = System.currentTimeMillis(); + + response = reader.readLine(); + + while (MESSAGES.PING.toString().equals(response)) { + logger.debug("ping {}", response); + response = reader.readLine(); + } + + if (MESSAGES.WARN.toString().equals(response)) { + String warn = reader.readLine(); + boolean tryAgain = false; + if (warn.contains(TimeoutException.class.getName())) { + // Timeout converting audio to wav, possibly it's corrupted + evidence.setTimeOut(true); + stats.incTimeouts(); + } else if (warn.contains(SocketTimeoutException.class.getName()) || warn.contains(SocketException.class.getName())) { + tryAgain = true; + } + logger.warn("Fail to transcribe on server: {} audio: {} error: {}.{}", server, evidence.getPath(), warn, (tryAgain ? " The audio will be retried." : "")); + if (tryAgain) { + continue; + } + return null; + } + if (MESSAGES.ERROR.toString().equals(response) || response == null) { + String error = response != null ? reader.readLine() : "Remote server process crashed or node was turned off!"; + logger.error("Error 1 in communication with {}: {}. The audio will be retried.", server, error); + throw new SocketException(error); + } + + TextAndScore textAndScore = new TextAndScore(); + textAndScore.score = Double.parseDouble(response); + textAndScore.text = reader.readLine(); + + long t2 = System.currentTimeMillis(); + + if (!MESSAGES.DONE.toString().equals(reader.readLine())) { + logger.error("Error 2 in communication with {}. 
The audio will be retried.", server); + throw new SocketException("Error receiving transcription."); + } + + audioSendingTime.addAndGet(t1 - t0); + transcriptReceiveTime.addAndGet(t2 - t1); + + return textAndScore; + + } catch (SocketTimeoutException | SocketException e) { + if (e instanceof ConnectException) { + numConnectErrors.incrementAndGet(); + if (numConnectErrors.get() / this.worker.manager.getNumWorkers() >= MAX_CONNECT_ERRORS) { + throw new TooManyConnectException(); + } + sleepBeforeRetry(requestTime); + requestServers(true); + } else { + logger.warn("Network error communicating to server: " + server + ", retrying audio: " + evidence.getPath(), e); + } + } + } + + } + + private void sleepBeforeRetry(long lastRequestTime) { + long sleep = getRetryIntervalMillis() - (System.currentTimeMillis() - lastRequestTime); + if (sleep > 0) { + try { + Thread.sleep(sleep); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + +} diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java index d7e1627af0..555256d0b4 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java @@ -1,318 +1,11 @@ package iped.engine.task.transcript; -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.DataOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.net.ConnectException; -import java.net.Socket; -import java.net.SocketException; -import java.net.SocketTimeoutException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.text.DecimalFormat; -import 
java.util.ArrayList; -import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.tika.io.TemporaryResources; - -import iped.configuration.IConfigurationDirectory; -import iped.data.IItem; -import iped.engine.config.AudioTranscriptConfig; -import iped.engine.config.ConfigurationManager; -import iped.engine.core.Manager; -import iped.engine.io.TimeoutException; -import iped.engine.task.transcript.RemoteWav2Vec2Service.MESSAGES; -import iped.exception.IPEDException; - -public class RemoteWav2Vec2TranscriptTask extends AbstractTranscriptTask { - - private static Logger logger = LogManager.getLogger(Wav2Vec2TranscriptTask.class); - - private static final int MAX_CONNECT_ERRORS = 60; - - private static final int UPDATE_SERVERS_INTERVAL_MILLIS = 60000; - - private static List servers = new ArrayList<>(); - - private static int currentServer = -1; - - private static AtomicInteger numConnectErrors = new AtomicInteger(); - - private static AtomicLong audioSendingTime = new AtomicLong(); - - private static AtomicLong transcriptReceiveTime = new AtomicLong(); - - private static AtomicBoolean statsPrinted = new AtomicBoolean(); - - private static long lastUpdateServersTime = 0; - - private static class Server { - - String ip; - int port; - - public String toString() { - return ip + ":" + port; - } - } - - // See https://github.com/sepinf-inc/IPED/issues/1576 - private int getRetryIntervalMillis() { - // This depends on how much time worker nodes need to consume their queue. - // Of course audios duration, nodes queue size and performance affect this. - // This tries to be fair with clients independent of their number of threads. 
- return Manager.getInstance().getNumWorkers() * 100; - } - - @Override - public void init(ConfigurationManager configurationManager) throws Exception { - - super.init(configurationManager); - - if (!this.isEnabled()) { - return; - } - - if (!servers.isEmpty()) { - return; - } - - boolean disable = false; - if (transcriptConfig.getWav2vec2Service() == null) { - String ipedRoot = System.getProperty(IConfigurationDirectory.IPED_ROOT); - if (ipedRoot != null) { - Path path = new File(ipedRoot, "conf/" + AudioTranscriptConfig.CONF_FILE).toPath(); - configurationManager.getConfigurationDirectory().addPath(path); - configurationManager.addObject(transcriptConfig); - configurationManager.loadConfig(transcriptConfig); - // maybe user changed installation configs - if (transcriptConfig.getWav2vec2Service() == null) { - disable = true; - } else { - transcriptConfig.setEnabled(true); - transcriptConfig.setClassName(this.getClass().getName()); - } - } else { - disable = true; - } - } - - if (disable) { - transcriptConfig.setEnabled(false); - logger.warn("Remote transcription module disabled, service address not configured."); - return; - } - - requestServers(true); - - } - - private static synchronized void requestServers(RemoteWav2Vec2TranscriptTask task, boolean now) throws IOException { - if (!now && System.currentTimeMillis() - lastUpdateServersTime < UPDATE_SERVERS_INTERVAL_MILLIS) { - return; - } - String[] ipAndPort = task.transcriptConfig.getWav2vec2Service().split(":"); - String ip = ipAndPort[0]; - int port = Integer.parseInt(ipAndPort[1]); - try (Socket client = new Socket(ip, port); - InputStream is = client.getInputStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); - PrintWriter writer = new PrintWriter(new OutputStreamWriter(client.getOutputStream(), StandardCharsets.UTF_8), true)) { - - client.setSoTimeout(10000); - writer.println(MESSAGES.DISCOVER); - int numServers = Integer.parseInt(reader.readLine()); - 
List servers = new ArrayList<>(); - for (int i = 0; i < numServers; i++) { - String[] ipPort = reader.readLine().split(":"); - Server server = new Server(); - server.ip = ipPort[0]; - server.port = Integer.parseInt(ipPort[1]); - servers.add(server); - logger.info("Transcription server discovered: {}:{}", server.ip, server.port); - } - RemoteWav2Vec2TranscriptTask.servers = servers; - lastUpdateServersTime = System.currentTimeMillis(); - } catch (ConnectException e) { - String msg = "Central transcription node refused connection, is it online? " + e.toString(); - if (servers.isEmpty()) { - throw new IPEDException(msg); - } else { - logger.warn(msg); - } - } - } - - private void requestServers(boolean now) throws IOException { - requestServers(this, now); - } - - - @Override - public void finish() throws Exception { - super.finish(); - if (!statsPrinted.getAndSet(true)) { - int numWorkers = this.worker.manager.getNumWorkers(); - DecimalFormat df = new DecimalFormat(); - logger.info("Time spent to send audios: {}s", df.format(audioSendingTime.get() / (1000 * numWorkers))); - logger.info("Time spent to receive transcriptions: {}s", df.format(transcriptReceiveTime.get() / (1000 * numWorkers))); - } - } - - /** - * Returns a transcription server between the discovered ones using a simple - * circular approach. - * - * @return Server instance to use - */ - private static synchronized Server getServer() { - if (servers.isEmpty()) { - throw new IPEDException("No transcription server available!"); - } - currentServer++; - if (currentServer >= servers.size()) { - currentServer = 0; - } - return servers.get(currentServer); - } - - /** - * Don't convert to WAV on client side, return the audio as is. 
- */ - @Override - protected File getTempFileToTranscript(IItem evidence, TemporaryResources tmp) throws IOException, InterruptedException { - return evidence.getTempFile(); - } - - @Override - protected TextAndScore transcribeAudio(File tmpFile) throws Exception { - - while (true) { - requestServers(false); - Server server = getServer(); - long requestTime = System.currentTimeMillis(); - try (Socket serverSocket = new Socket(server.ip, server.port); - InputStream is = serverSocket.getInputStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); - BufferedOutputStream bos = new BufferedOutputStream(serverSocket.getOutputStream())) { - - numConnectErrors.set(0); - - int timeoutSecs = (int) (MIN_TIMEOUT + TIMEOUT_PER_MB * tmpFile.length() / (1 << 20)); - serverSocket.setSoTimeout(1000 * timeoutSecs); - - String response = reader.readLine(); - if (response == null || MESSAGES.BUSY.toString().equals(response)) { - logger.debug("Transcription server {} busy, trying another one.", server); - sleepBeforeRetry(requestTime); - continue; - } - if (!MESSAGES.ACCEPTED.toString().equals(response)) { - logger.error("Error 0 in communication with {}. 
The audio will be retried.", server); - continue; - } - - logger.debug("Transcription server {} accepted connection", server); - - long t0 = System.currentTimeMillis(); - - bos.write(MESSAGES.VERSION_1_2.toString().getBytes()); - // bos.write("\n".getBytes()); - - bos.write(MESSAGES.AUDIO_SIZE.toString().getBytes()); - - DataOutputStream dos = new DataOutputStream(bos); - // Must use long see #1833 - dos.writeLong(tmpFile.length()); - dos.flush(); - - Files.copy(tmpFile.toPath(), bos); - bos.flush(); - - long t1 = System.currentTimeMillis(); - - response = reader.readLine(); - - while (MESSAGES.PING.toString().equals(response)) { - logger.debug("ping {}", response); - response = reader.readLine(); - } - - if (MESSAGES.WARN.toString().equals(response)) { - String warn = reader.readLine(); - boolean tryAgain = false; - if (warn.contains(TimeoutException.class.getName())) { - // Timeout converting audio to wav, possibly it's corrupted - evidence.setTimeOut(true); - stats.incTimeouts(); - } else if (warn.contains(SocketTimeoutException.class.getName()) || warn.contains(SocketException.class.getName())) { - tryAgain = true; - } - logger.warn("Fail to transcribe on server: {} audio: {} error: {}.{}", server, evidence.getPath(), warn, (tryAgain ? " The audio will be retried." : "")); - if (tryAgain) { - continue; - } - return null; - } - if (MESSAGES.ERROR.toString().equals(response) || response == null) { - String error = response != null ? reader.readLine() : "Remote server process crashed or node was turned off!"; - logger.error("Error 1 in communication with {}: {}. The audio will be retried.", server, error); - throw new SocketException(error); - } - - TextAndScore textAndScore = new TextAndScore(); - textAndScore.score = Double.parseDouble(response); - textAndScore.text = reader.readLine(); - - long t2 = System.currentTimeMillis(); - - if (!MESSAGES.DONE.toString().equals(reader.readLine())) { - logger.error("Error 2 in communication with {}. 
The audio will be retried.", server); - throw new SocketException("Error receiving transcription."); - } - - audioSendingTime.addAndGet(t1 - t0); - transcriptReceiveTime.addAndGet(t2 - t1); - - return textAndScore; - - } catch (SocketTimeoutException | SocketException e) { - if (e instanceof ConnectException) { - numConnectErrors.incrementAndGet(); - if (numConnectErrors.get() / this.worker.manager.getNumWorkers() >= MAX_CONNECT_ERRORS) { - throw new TooManyConnectException(); - } - sleepBeforeRetry(requestTime); - requestServers(true); - } else { - logger.warn("Network error communicating to server: " + server + ", retrying audio: " + evidence.getPath(), e); - } - } - } - - } - - private void sleepBeforeRetry(long lastRequestTime) { - long sleep = getRetryIntervalMillis() - (System.currentTimeMillis() - lastRequestTime); - if (sleep > 0) { - try { - Thread.sleep(sleep); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } +/** + * Used just for backwards compatibility with old config files. 
+ * + * @author Nassif + * + */ +public class RemoteWav2Vec2TranscriptTask extends RemoteAudioTranscriptTask { } From 3e4ba415dd68fc29c7f68273886cc87f60cede7a Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:27:30 -0300 Subject: [PATCH 04/31] '#1823: make private methods protected, make inner class package visible --- .../task/transcript/Wav2Vec2TranscriptTask.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java index ea4f50bc4c..8cd05ae366 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java @@ -36,20 +36,20 @@ public class Wav2Vec2TranscriptTask extends AbstractTranscriptTask { private static final int MAX_TRANSCRIPTIONS = 100000; private static final byte[] NEW_LINE = "\n".getBytes(); - private static volatile Integer numProcesses; + protected static volatile Integer numProcesses; private static LinkedBlockingDeque deque = new LinkedBlockingDeque<>(); private static volatile Level logLevel = Level.forName("MSG", 250); - private static class Server { + static class Server { Process process; BufferedReader reader; int transcriptionsDone = 0; int device = 0; } - private static int getNumProcessors() { + protected static int getNumProcessors() { SystemInfo si = new SystemInfo(); HardwareAbstractionLayer hal = si.getHardware(); CentralProcessor cpu = hal.getProcessor(); @@ -96,7 +96,7 @@ public void init(ConfigurationManager configurationManager) throws Exception { } - private Server startServer(int device) throws StartupException { + protected Server startServer(int device) throws StartupException { try { return startServer0(device); } catch (Exception e) { @@ -109,7 +109,7 @@ private Server startServer(int device) throws 
StartupException { } } - private Server startServer0(int device) throws IOException { + protected Server startServer0(int device) throws IOException { if (numProcesses != null && device == numProcesses) { return null; } @@ -172,7 +172,7 @@ private Server startServer0(int device) throws IOException { return server; } - private void logInputStream(InputStream is) { + protected void logInputStream(InputStream is) { Thread t = new Thread() { public void run() { byte[] buf = new byte[1024]; From 20aeff970c447dd2318bef4d35c038d7cbef870c Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:31:16 -0300 Subject: [PATCH 05/31] '#1823: new Whisper process python service --- .../resources/scripts/tasks/WhisperProcess.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 iped-app/resources/scripts/tasks/WhisperProcess.py diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py new file mode 100644 index 0000000000..e3bfb0fd55 --- /dev/null +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -0,0 +1,88 @@ +import sys +import numpy +stdout = sys.stdout +sys.stdout = sys.stderr + +terminate = 'terminate_process' +model_loaded = 'model_loaded' +library_loaded = 'library_loaded' +finished = 'transcription_finished' +ping = 'ping' + +def main(): + + modelName = sys.argv[1] + deviceNum = int(sys.argv[2]) + threads = int(sys.argv[3]) + language = sys.argv[4] + + if language == 'auto': + language = None + + from faster_whisper import WhisperModel + + print(library_loaded, file=stdout, flush=True) + + import torch + cudaCount = torch.cuda.device_count() + + print(str(cudaCount), file=stdout, flush=True) + + if cudaCount > 0: + deviceId = 'cuda' + else: + deviceId = 'cpu' + deviceNum = 0 + + try: + model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type="int8") + + except Exception as e: + if deviceId != 'cpu': + # loading on GPU 
failed (OOM?), try on CPU + deviceId = 'cpu' + model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type="int8") + else: + raise e + + print(model_loaded, file=stdout, flush=True) + print(deviceId, file=stdout, flush=True) + + while True: + + line = input() + + if line == terminate: + break + if line == ping: + print(ping, file=stdout, flush=True) + continue + + transcription = '' + probs = [] + try: + segments, info = model.transcribe(audio=line, language=language, beam_size=5, word_timestamps=True) + for segment in segments: + transcription += segment.text + words = segment.words + if words is not None: + probs += [word.probability for word in words] + + except Exception as e: + msg = repr(e).replace('\n', ' ').replace('\r', ' ') + print(msg, file=stdout, flush=True) + continue + + text = transcription.replace('\n', ' ').replace('\r', ' ') + + probs = probs if len(probs) != 0 else [0] + finalScore = numpy.average(probs) + + print(finished, file=stdout, flush=True) + print(str(finalScore), file=stdout, flush=True) + print(text, file=stdout, flush=True) + + return + +if __name__ == "__main__": + main() From d2e1d7035dfd0935ddaa030a7ccadc75636fb2ac Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 20:32:27 -0300 Subject: [PATCH 06/31] '#1823: new WhisperTranscriptTask communicating with the python process --- .../transcript/WhisperTranscriptTask.java | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java new file mode 100644 index 0000000000..c6659be0bc --- /dev/null +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -0,0 +1,88 @@ +package iped.engine.task.transcript; + +import 
java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.commons.lang3.SystemUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import iped.configuration.IConfigurationDirectory; +import iped.engine.config.AudioTranscriptConfig; +import iped.engine.config.Configuration; + +public class WhisperTranscriptTask extends Wav2Vec2TranscriptTask { + + private static Logger logger = LogManager.getLogger(Wav2Vec2TranscriptTask.class); + + private static final String SCRIPT_PATH = "/scripts/tasks/WhisperProcess.py"; + private static final String LIBRARY_LOADED = "library_loaded"; + private static final String MODEL_LOADED = "model_loaded"; + + @Override + protected Server startServer0(int device) throws IOException { + if (numProcesses != null && device == numProcesses) { + return null; + } + ProcessBuilder pb = new ProcessBuilder(); + String ipedRoot = System.getProperty(IConfigurationDirectory.IPED_ROOT); + if (ipedRoot == null) { + ipedRoot = Configuration.getInstance().appRoot; + } + String python = SystemUtils.IS_OS_WINDOWS ? 
ipedRoot + "/python/python.exe" : "python3"; + String script = ipedRoot + SCRIPT_PATH; + String model = super.transcriptConfig.getWhisperModel(); + if (model == null) { + throw new StartupException("You must configure '" + AudioTranscriptConfig.WHISPER_MODEL + "' in audio transcription config file."); + } + + int cpus = getNumProcessors(); + int threads = Runtime.getRuntime().availableProcessors() / cpus; + + pb.command(python, script, model, Integer.toString(device), Integer.toString(threads), transcriptConfig.getLanguages().get(0)); + + Process process = pb.start(); + + logInputStream(process.getErrorStream()); + + BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream())); + + String line = reader.readLine(); + + if (!LIBRARY_LOADED.equals(line)) { + throw new StartupException("'faster_whisper' python lib not loaded correctly. Have you installed it?"); + } + + int cudaCount = Integer.valueOf(reader.readLine()); + if (numProcesses == null) { + logger.info("Number of CUDA devices detected: {}", cudaCount); + logger.info("Number of CPU devices detected: {}", cpus); + if (cudaCount > 0) { + numProcesses = cudaCount; + } else { + numProcesses = cpus; + } + } + + String msgToIgnore = "Ignored unknown"; + while ((line = reader.readLine()) != null && line.startsWith(msgToIgnore)) + ; + + if (!MODEL_LOADED.equals(line)) { + throw new StartupException("Error loading '" + model + "' transcription model."); + } + + line = reader.readLine(); + + logger.info("Model loaded on device={}", line); + + Server server = new Server(); + server.process = process; + server.reader = reader; + server.device = device; + + return server; + } + +} From 147cdf19493a2af7581611c0757275fae57627da Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 21:10:12 -0300 Subject: [PATCH 07/31] '#1823: fix a typo --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index fc62f42d7b..64e3da2215 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -15,7 +15,7 @@ implementationClass = iped.engine.task.transcript.VoskTranscriptTask # If you enable this, you must set 'huggingFaceModel' param below. #implementationClass = iped.engine.task.transcript.Wav2Vec2TranscriptTask -# Uses a local Whiper implementation for transcription. Accuracy is better than wav2vec2 depending on the model. +# Uses a local Whisper implementation for transcription. Accuracy is better than wav2vec2 depending on the model. # This is slower than wav2vec2 depending on the model. Using a very good GPU is highly recommended! # Please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#whisper # If you enable this, you must set 'whisperModel' param below. From 71e125abc19509b203839e94683d8d2f6345d457 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 21:11:54 -0300 Subject: [PATCH 08/31] '#1823: convert UI language to whisper supported language format --- .../iped/engine/task/transcript/WhisperTranscriptTask.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index c6659be0bc..ba5f29e2d7 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -40,7 +40,12 @@ protected Server startServer0(int device) throws IOException { int cpus = getNumProcessors(); int threads = Runtime.getRuntime().availableProcessors() / cpus; - pb.command(python, script, model, Integer.toString(device), Integer.toString(threads), 
transcriptConfig.getLanguages().get(0)); + String lang = transcriptConfig.getLanguages().get(0); + if (lang.contains("-")) { + lang = lang.substring(0, lang.indexOf("-")); + } + + pb.command(python, script, model, Integer.toString(device), Integer.toString(threads), lang); Process process = pb.start(); From 2d0332fe339426fc135fee2ab539f659f9fbc1f1 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Fri, 12 Apr 2024 21:24:18 -0300 Subject: [PATCH 09/31] '#1823: allow language auto detection configuration --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 4 +++- iped-app/resources/scripts/tasks/WhisperProcess.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 64e3da2215..7f3da09206 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -105,7 +105,9 @@ minWordScore = 0.5 # Possible values: tiny, base, small, medium, large-v3 # large-v3 is much better than medium, but 2x slower and uses 2x more memory. -# If you know the language you want to transcribe, please set the 'language' option above. Auto detection causes mistakes. +# If you know the language you want to transcribe, please set the 'language' option above. 
+# 'language = auto' uses the 'locale' set on LocalConfig.txt +# 'language = detect' uses auto detection, but it can cause mistakes whisperModel = medium ######################################### diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index e3bfb0fd55..a81c062f01 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -16,7 +16,7 @@ def main(): threads = int(sys.argv[3]) language = sys.argv[4] - if language == 'auto': + if language == 'detect': language = None from faster_whisper import WhisperModel From 7177a911516e1a7a7cc5762d372cbc2db92afc91 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 13 Apr 2024 15:41:45 -0300 Subject: [PATCH 10/31] '#1823: uses a much smaller dependency to get number of GPUs --- iped-app/resources/scripts/tasks/WhisperProcess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index a81c062f01..236fa4751d 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -23,8 +23,8 @@ def main(): print(library_loaded, file=stdout, flush=True) - import torch - cudaCount = torch.cuda.device_count() + import GPUtil + cudaCount = len(GPUtil.getGPUs()) print(str(cudaCount), file=stdout, flush=True) From 71df157ead81cfa01e272ced7ba2f7df11a3f919 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 13 Apr 2024 15:47:27 -0300 Subject: [PATCH 11/31] '#1823: rename remote transcript classes to be implementation decoupled --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 2 +- ...c2Discovery.java => RemoteTranscriptionDiscovery.java} | 4 ++-- ...v2Vec2Service.java => RemoteTranscriptionService.java} | 6 +++--- ...ioTranscriptTask.java => RemoteTranscriptionTask.java} | 8 ++++---- 
.../task/transcript/RemoteWav2Vec2TranscriptTask.java | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) rename iped-engine/src/main/java/iped/engine/task/transcript/{RemoteWav2Vec2Discovery.java => RemoteTranscriptionDiscovery.java} (99%) rename iped-engine/src/main/java/iped/engine/task/transcript/{RemoteWav2Vec2Service.java => RemoteTranscriptionService.java} (99%) rename iped-engine/src/main/java/iped/engine/task/transcript/{RemoteAudioTranscriptTask.java => RemoteTranscriptionTask.java} (97%) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 7f3da09206..12fef53f2c 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -25,7 +25,7 @@ implementationClass = iped.engine.task.transcript.VoskTranscriptTask # The remote service is useful if you have a central server/cluster with many GPUs to be shared among processing nodes. # Please check steps on https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 # If enable this, you must set 'remoteServiceAddress' param below. -#implementationClass = iped.engine.task.transcript.RemoteAudioTranscriptTask +#implementationClass = iped.engine.task.transcript.RemoteTranscriptionTask # If you want to use the Microsoft Azure service implementation, comment above and uncomment below. # You MUST include Microsoft client-sdk.jar into plugins folder. 
diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Discovery.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionDiscovery.java similarity index 99% rename from iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Discovery.java rename to iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionDiscovery.java index 01c69855d6..4e07764b6a 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Discovery.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionDiscovery.java @@ -20,9 +20,9 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -import iped.engine.task.transcript.RemoteWav2Vec2Service.MESSAGES; +import iped.engine.task.transcript.RemoteTranscriptionService.MESSAGES; -public class RemoteWav2Vec2Discovery { +public class RemoteTranscriptionDiscovery { private static final File statsFile = new File(System.getProperty("user.home"), "transcription.stats"); diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Service.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java similarity index 99% rename from iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Service.java rename to iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java index 93d436f984..91fa0b00db 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2Service.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java @@ -35,7 +35,7 @@ import iped.io.URLUtil; import iped.utils.IOUtil; -public class RemoteWav2Vec2Service { +public class RemoteTranscriptionService { static enum MESSAGES { ACCEPTED, @@ -131,11 +131,11 @@ public static void main(String[] args) throws Exception { printHelpAndExit(); } - File jar = new 
File(URLUtil.getURL(RemoteWav2Vec2Service.class).toURI()); + File jar = new File(URLUtil.getURL(RemoteTranscriptionService.class).toURI()); File root = jar.getParentFile().getParentFile(); System.setProperty("org.apache.logging.log4j.level", "INFO"); - logger = LoggerFactory.getLogger(RemoteWav2Vec2Service.class); + logger = LoggerFactory.getLogger(RemoteTranscriptionService.class); Configuration.getInstance().loadConfigurables(root.getAbsolutePath()); ConfigurationManager cm = ConfigurationManager.get(); diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionTask.java similarity index 97% rename from iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java rename to iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionTask.java index 770242a248..8c321db98f 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteAudioTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionTask.java @@ -33,10 +33,10 @@ import iped.engine.config.ConfigurationManager; import iped.engine.core.Manager; import iped.engine.io.TimeoutException; -import iped.engine.task.transcript.RemoteWav2Vec2Service.MESSAGES; +import iped.engine.task.transcript.RemoteTranscriptionService.MESSAGES; import iped.exception.IPEDException; -public class RemoteAudioTranscriptTask extends AbstractTranscriptTask { +public class RemoteTranscriptionTask extends AbstractTranscriptTask { private static Logger logger = LogManager.getLogger(Wav2Vec2TranscriptTask.class); @@ -119,7 +119,7 @@ public void init(ConfigurationManager configurationManager) throws Exception { } - private static synchronized void requestServers(RemoteAudioTranscriptTask task, boolean now) throws IOException { + private static synchronized void requestServers(RemoteTranscriptionTask task, boolean now) throws IOException { 
if (!now && System.currentTimeMillis() - lastUpdateServersTime < UPDATE_SERVERS_INTERVAL_MILLIS) { return; } @@ -143,7 +143,7 @@ private static synchronized void requestServers(RemoteAudioTranscriptTask task, servers.add(server); logger.info("Transcription server discovered: {}:{}", server.ip, server.port); } - RemoteAudioTranscriptTask.servers = servers; + RemoteTranscriptionTask.servers = servers; lastUpdateServersTime = System.currentTimeMillis(); } catch (ConnectException e) { String msg = "Central transcription node refused connection, is it online? " + e.toString(); diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java index 555256d0b4..eda8b715de 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteWav2Vec2TranscriptTask.java @@ -6,6 +6,6 @@ * @author Nassif * */ -public class RemoteWav2Vec2TranscriptTask extends RemoteAudioTranscriptTask { +public class RemoteWav2Vec2TranscriptTask extends RemoteTranscriptionTask { } From 53f80a6267057ca3398e48b637dab656785d825b Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 13 Apr 2024 16:17:03 -0300 Subject: [PATCH 12/31] '#1823: makes remote transcription load implementation class from config --- .../engine/task/transcript/RemoteTranscriptionService.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java index 91fa0b00db..39f974ddf1 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/RemoteTranscriptionService.java @@ -143,10 +143,10 @@ public static void main(String[] 
args) throws Exception { LocalConfig localConfig = new LocalConfig(); cm.addObject(audioConfig); cm.addObject(localConfig); - cm.loadConfig(audioConfig); cm.loadConfig(localConfig); + cm.loadConfig(audioConfig); - Wav2Vec2TranscriptTask task = new Wav2Vec2TranscriptTask(); + AbstractTranscriptTask task = (AbstractTranscriptTask) Class.forName(audioConfig.getClassName()).getDeclaredConstructor().newInstance(); audioConfig.setEnabled(true); task.init(cm); @@ -261,7 +261,7 @@ private static void removeFrombeaconQueq(OpenConnectons opc) { } } - private static void waitRequests(ServerSocket server, Wav2Vec2TranscriptTask task, String discoveryIp) { + private static void waitRequests(ServerSocket server, AbstractTranscriptTask task, String discoveryIp) { AtomicInteger jobs = new AtomicInteger(); while (true) { try { From cc7b4955854aef854a07054b5f901090b0da0afe Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 13 Apr 2024 19:14:23 -0300 Subject: [PATCH 13/31] '#1823: update config file comments --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 12fef53f2c..2752f300c1 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -10,21 +10,21 @@ implementationClass = iped.engine.task.transcript.VoskTranscriptTask # Uses a local wav2vec2 implementation for transcription. Accuracy is much better than most Vosk models. -# This is AT LEAST 1 order of magnitude slower than Vosk on high end CPUs. Using a good GPU is highly recommended! +# This is up to 10x slower than Vosk on high end CPUs. Using a good GPU is highly recommended! # Please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 # If you enable this, you must set 'huggingFaceModel' param below. 
#implementationClass = iped.engine.task.transcript.Wav2Vec2TranscriptTask # Uses a local Whisper implementation for transcription. Accuracy is better than wav2vec2 depending on the model. -# This is slower than wav2vec2 depending on the model. Using a very good GPU is highly recommended! +# This is up to 4x slower than wav2vec2 depending on compared models. Using a high end GPU is strongly recommended! # Please check the installation steps: https://github.com/sepinf-inc/IPED/wiki/User-Manual#whisper # If you enable this, you must set 'whisperModel' param below. #implementationClass = iped.engine.task.transcript.WhisperTranscriptTask # Uses a remote service for transcription. # The remote service is useful if you have a central server/cluster with many GPUs to be shared among processing nodes. -# Please check steps on https://github.com/sepinf-inc/IPED/wiki/User-Manual#wav2vec2 -# If enable this, you must set 'remoteServiceAddress' param below. +# Please check steps on https://github.com/sepinf-inc/IPED/wiki/User-Manual#remote-transcription +# If you enable this, you must set 'remoteServiceAddress' param below. #implementationClass = iped.engine.task.transcript.RemoteTranscriptionTask # If you want to use the Microsoft Azure service implementation, comment above and uncomment below. 
From b6ec69d8eba996b4315176b64896a05d0f9af253 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Mon, 15 Apr 2024 19:31:43 -0300 Subject: [PATCH 14/31] '#1823: use float16, not int8, for better precision and ~50% more speed --- iped-app/resources/scripts/tasks/WhisperProcess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 236fa4751d..984e772fbf 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -35,13 +35,13 @@ def main(): deviceNum = 0 try: - model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type="int8") + model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type="float16") except Exception as e: if deviceId != 'cpu': # loading on GPU failed (OOM?), try on CPU deviceId = 'cpu' - model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type="int8") + model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type="float16") else: raise e From c5599106c9d953e082b7a463af3ad746c37452e0 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Mon, 15 Apr 2024 20:10:58 -0300 Subject: [PATCH 15/31] '#1823: use numpy.mean instead of numpy.average (by @gfd2020) --- iped-app/resources/scripts/tasks/WhisperProcess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 984e772fbf..f14f92fdc2 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -76,7 +76,7 @@ def main(): text = transcription.replace('\n', ' ').replace('\r', ' ') probs = probs if len(probs) != 0 else [0] - finalScore = numpy.average(probs) + 
finalScore = numpy.mean(probs) print(finished, file=stdout, flush=True) print(str(finalScore), file=stdout, flush=True) From 06cc625d90869b1700ec070b3541ce7b0cd7a6f8 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Mon, 15 Apr 2024 22:09:24 -0300 Subject: [PATCH 16/31] '#1823: fix commit b6ec69d: uses float16 just for gpu, int8 for cpu --- iped-app/resources/scripts/tasks/WhisperProcess.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index f14f92fdc2..dacd3d4673 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -28,20 +28,23 @@ def main(): print(str(cudaCount), file=stdout, flush=True) + compute_type = 'int8' if cudaCount > 0: deviceId = 'cuda' + compute_type = 'float16' else: deviceId = 'cpu' deviceNum = 0 try: - model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type="float16") + model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type=compute_type) except Exception as e: if deviceId != 'cpu': # loading on GPU failed (OOM?), try on CPU deviceId = 'cpu' - model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type="float16") + compute_type = 'int8' + model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type=compute_type) else: raise e From f094f733c5eaf6534cd08c2cbd82df79a70fea00 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 27 Apr 2024 22:30:42 -0300 Subject: [PATCH 17/31] '#1823: change code to use WhisperX instead of Faster-Whisper --- .../resources/scripts/tasks/WhisperProcess.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py 
b/iped-app/resources/scripts/tasks/WhisperProcess.py index dacd3d4673..7f32f9b96b 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -19,7 +19,7 @@ def main(): if language == 'detect': language = None - from faster_whisper import WhisperModel + import whisperx print(library_loaded, file=stdout, flush=True) @@ -37,7 +37,7 @@ def main(): deviceNum = 0 try: - model = WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type=compute_type) + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type) except Exception as e: if deviceId != 'cpu': @@ -62,14 +62,14 @@ def main(): continue transcription = '' - probs = [] + logprobs = [] try: - segments, info = model.transcribe(audio=line, language=language, beam_size=5, word_timestamps=True) - for segment in segments: - transcription += segment.text - words = segment.words - if words is not None: - probs += [word.probability for word in words] + audio = whisperx.load_audio(line) + result = model.transcribe(audio, batch_size=8, language=language) + for segment in result['segments']: + transcription += segment['text'] + if 'avg_logprob' in segment: + logprobs.append(segment['avg_logprob']) except Exception as e: msg = repr(e).replace('\n', ' ').replace('\r', ' ') @@ -78,8 +78,10 @@ def main(): text = transcription.replace('\n', ' ').replace('\r', ' ') - probs = probs if len(probs) != 0 else [0] - finalScore = numpy.mean(probs) + if len(logprobs) == 0: + logprobs = [0] + + finalScore = numpy.mean(numpy.exp(logprobs)) print(finished, file=stdout, flush=True) print(str(finalScore), file=stdout, flush=True) From 67e5342282561ffa9f538886ebb6c94678b12559 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Thu, 25 Apr 2024 16:23:09 -0300 Subject: [PATCH 18/31] '#1823: don't break audios in 59s to benefit from batching long audios --- 
.../iped/engine/task/transcript/WhisperTranscriptTask.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index ba5f29e2d7..3765eb2b5f 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -1,6 +1,7 @@ package iped.engine.task.transcript; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; import java.io.InputStreamReader; @@ -90,4 +91,9 @@ protected Server startServer0(int device) throws IOException { return server; } + @Override + protected TextAndScore transcribeAudio(File tmpFile) throws Exception { + return transcribeWavPart(tmpFile); + } + } From 231b85de623605cb7dc2a3ff2777f0426bde2f57 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 27 Apr 2024 22:50:55 -0300 Subject: [PATCH 19/31] '#1823: fix probability computation when there are no results --- iped-app/resources/scripts/tasks/WhisperProcess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 7f32f9b96b..71b410e8f4 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -79,9 +79,9 @@ def main(): text = transcription.replace('\n', ' ').replace('\r', ' ') if len(logprobs) == 0: - logprobs = [0] - - finalScore = numpy.mean(numpy.exp(logprobs)) + finalScore = 0 + else: + finalScore = numpy.mean(numpy.exp(logprobs)) print(finished, file=stdout, flush=True) print(str(finalScore), file=stdout, flush=True) From ca30e57f122f272eb95fc4a37eaad217989d3d54 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 11:41:16 -0300 Subject: [PATCH 20/31] '#1823: update library name in 
error message --- .../java/iped/engine/task/transcript/WhisperTranscriptTask.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index 3765eb2b5f..d4d05744d0 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -57,7 +57,7 @@ protected Server startServer0(int device) throws IOException { String line = reader.readLine(); if (!LIBRARY_LOADED.equals(line)) { - throw new StartupException("'faster_whisper' python lib not loaded correctly. Have you installed it?"); + throw new StartupException("'whisperx' python lib not loaded correctly. Have you installed it?"); } int cudaCount = Integer.valueOf(reader.readLine()); From 4f7fcf39ff81eeb6076a878361738a9a4ec997fa Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 11:43:45 -0300 Subject: [PATCH 21/31] '#1823: fix fallback code to use the same lib, add a warning message --- iped-app/resources/scripts/tasks/WhisperProcess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 71b410e8f4..f0f68c5ffe 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -42,9 +42,10 @@ def main(): except Exception as e: if deviceId != 'cpu': # loading on GPU failed (OOM?), try on CPU + print('FAILED to load model on GPU, fallbacking to CPU!', file=sys.stderr) deviceId = 'cpu' compute_type = 'int8' - model = WhisperModel(model_size_or_path=modelName, device=deviceId, cpu_threads=threads, compute_type=compute_type) + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, 
compute_type=compute_type) else: raise e From 8b23384840b5eb18b3e35a9dc4596ad456a03d8e Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 16:35:29 -0300 Subject: [PATCH 22/31] '#1823: redirect warmless console messages to log --- .../resources/scripts/tasks/WhisperProcess.py | 4 +- .../transcript/Wav2Vec2TranscriptTask.java | 2 +- .../transcript/WhisperTranscriptTask.java | 40 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index f0f68c5ffe..8e2b6456b1 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -37,7 +37,7 @@ def main(): deviceNum = 0 try: - model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type) + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) except Exception as e: if deviceId != 'cpu': @@ -45,7 +45,7 @@ def main(): print('FAILED to load model on GPU, fallbacking to CPU!', file=sys.stderr) deviceId = 'cpu' compute_type = 'int8' - model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type) + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) else: raise e diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java index 8cd05ae366..84a92dca8a 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/Wav2Vec2TranscriptTask.java @@ -40,7 +40,7 @@ public class Wav2Vec2TranscriptTask extends 
AbstractTranscriptTask { private static LinkedBlockingDeque deque = new LinkedBlockingDeque<>(); - private static volatile Level logLevel = Level.forName("MSG", 250); + protected static volatile Level logLevel = Level.forName("MSG", 250); static class Server { Process process; diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index d4d05744d0..6a77455cca 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -3,7 +3,10 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.List; import org.apache.commons.lang3.SystemUtils; import org.apache.logging.log4j.LogManager; @@ -96,4 +99,41 @@ protected TextAndScore transcribeAudio(File tmpFile) throws Exception { return transcribeWavPart(tmpFile); } + @Override + protected void logInputStream(InputStream is) { + List ignoreMsgs = Arrays.asList( + "With dispatcher enabled, this function is no-op. 
You can remove the function call.", + "torchvision is not available - cannot save figures", + "Lightning automatically upgraded your loaded checkpoint from", + "Model was trained with pyannote.audio 0.0.1, yours is", + "Model was trained with torch 1.10.0+cu102, yours is"); + Thread t = new Thread() { + public void run() { + byte[] buf = new byte[1024]; + int read = 0; + try { + while ((read = is.read(buf)) != -1) { + String msg = new String(buf, 0, read).trim(); + boolean ignore = false; + for (String i : ignoreMsgs) { + if (msg.contains(i)) { + ignore = true; + break; + } + } + if (ignore) { + logger.warn(msg); + } else { + logger.log(logLevel, msg); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } + }; + t.setDaemon(true); + t.start(); + } + } From abb74fb333a7d400e6073c5ca75f392d33d66f4a Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 17:52:37 -0300 Subject: [PATCH 23/31] '#1823: externalize batchSize and precision (compute_type) params --- .../config/conf/AudioTranscriptConfig.txt | 9 ++++++++ .../resources/scripts/tasks/WhisperProcess.py | 9 ++++---- .../engine/config/AudioTranscriptConfig.java | 21 +++++++++++++++++++ .../transcript/WhisperTranscriptTask.java | 5 ++++- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 2752f300c1..2dd98e6310 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -110,6 +110,15 @@ minWordScore = 0.5 # 'language = detect' uses auto detection, but it can cause mistakes whisperModel = medium +# Compute type precision. This affects accuracy, speed and memory usage. +# Possible values: float32, float16 (GPU only), int8 +precision = float32 + +# Batch size (number of parallel transcriptions). If you have a GPU with enough memory, +# increasing this value to e.g. 
16 can speed up transcribing long audios up to 10x. +# Test what is the better value for your GPU before hitting OOM. +batchSize = 1 + ######################################### # RemoteAudioTranscriptTask options ######################################### diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 8e2b6456b1..073b39297c 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -15,6 +15,8 @@ def main(): deviceNum = int(sys.argv[2]) threads = int(sys.argv[3]) language = sys.argv[4] + compute_type = sys.argv[5] + batch_size = int(sys.argv[6]) if language == 'detect': language = None @@ -28,10 +30,8 @@ def main(): print(str(cudaCount), file=stdout, flush=True) - compute_type = 'int8' if cudaCount > 0: deviceId = 'cuda' - compute_type = 'float16' else: deviceId = 'cpu' deviceNum = 0 @@ -44,7 +44,8 @@ def main(): # loading on GPU failed (OOM?), try on CPU print('FAILED to load model on GPU, fallbacking to CPU!', file=sys.stderr) deviceId = 'cpu' - compute_type = 'int8' + if compute_type == 'float16': # not supported on CPU + compute_type = 'float32' model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) else: raise e @@ -66,7 +67,7 @@ def main(): logprobs = [] try: audio = whisperx.load_audio(line) - result = model.transcribe(audio, batch_size=8, language=language) + result = model.transcribe(audio, batch_size=batch_size, language=language) for segment in result['segments']: transcription += segment['text'] if 'avg_logprob' in segment: diff --git a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java index e950824bb2..b7ea08e500 100644 --- a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java +++ 
b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java @@ -33,6 +33,8 @@ public class AudioTranscriptConfig extends AbstractTaskPropertiesConfig { private static final String GOOGLE_MODEL = "googleModel"; private static final String LANG_AUTO_VAL = "auto"; private static final String SKIP_KNOWN_FILES = "skipKnownFiles"; + private static final String PRECISION = "precision"; + private static final String BATCH_SIZE = "batchSize"; private List languages = new ArrayList<>(); private List mimesToProcess = new ArrayList<>(); @@ -49,6 +51,16 @@ public class AudioTranscriptConfig extends AbstractTaskPropertiesConfig { private String remoteService; private String googleModel; private boolean skipKnownFiles = true; + private String precision = "float32"; + private int batchSize = 1; + + public String getPrecision() { + return precision; + } + + public int getBatchSize() { + return batchSize; + } public boolean getSkipKnownFiles() { return this.skipKnownFiles; @@ -155,6 +167,7 @@ public void processProperties(UTF8Properties properties) { if (whisperModel != null) { whisperModel = whisperModel.strip(); } + remoteService = properties.getProperty(REMOTE_SERVICE); if (remoteService == null) { remoteService = properties.getProperty(WAV2VEC2_SERVICE); @@ -179,6 +192,14 @@ public void processProperties(UTF8Properties properties) { if (value != null) { timeoutPerSec = Integer.valueOf(value.trim()); } + value = properties.getProperty(PRECISION); + if (value != null) { + precision = value.trim(); + } + value = properties.getProperty(BATCH_SIZE); + if (value != null) { + batchSize = Integer.parseInt(value.trim()); + } } /** diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index 6a77455cca..bd249ebdd2 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ 
b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -49,7 +49,10 @@ protected Server startServer0(int device) throws IOException { lang = lang.substring(0, lang.indexOf("-")); } - pb.command(python, script, model, Integer.toString(device), Integer.toString(threads), lang); + String precision = transcriptConfig.getPrecision(); + String batchSize = Integer.toString(transcriptConfig.getBatchSize()); + + pb.command(python, script, model, Integer.toString(device), Integer.toString(threads), lang, precision, batchSize); Process process = pb.start(); From 0c68009862d2f998e35219e828857edd36c4526c Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 18:53:00 -0300 Subject: [PATCH 24/31] '#1823: change default precision from float32 to int8 --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 4 ++-- iped-app/resources/scripts/tasks/WhisperProcess.py | 2 +- .../main/java/iped/engine/config/AudioTranscriptConfig.java | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 2dd98e6310..76cd11653f 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -111,8 +111,8 @@ minWordScore = 0.5 whisperModel = medium # Compute type precision. This affects accuracy, speed and memory usage. -# Possible values: float32, float16 (GPU only), int8 -precision = float32 +# Possible values: float32 (better), float16 (recommended for GPU), int8 (faster) +precision = int8 # Batch size (number of parallel transcriptions). If you have a GPU with enough memory, # increasing this value to e.g. 16 can speed up transcribing long audios up to 10x. 
diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index 073b39297c..e8d5752979 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -45,7 +45,7 @@ def main(): print('FAILED to load model on GPU, fallbacking to CPU!', file=sys.stderr) deviceId = 'cpu' if compute_type == 'float16': # not supported on CPU - compute_type = 'float32' + compute_type = 'int8' model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) else: raise e diff --git a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java index b7ea08e500..7e118e70d4 100644 --- a/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java +++ b/iped-engine/src/main/java/iped/engine/config/AudioTranscriptConfig.java @@ -51,7 +51,7 @@ public class AudioTranscriptConfig extends AbstractTaskPropertiesConfig { private String remoteService; private String googleModel; private boolean skipKnownFiles = true; - private String precision = "float32"; + private String precision = "int8"; private int batchSize = 1; public String getPrecision() { From 2861ea6e0908e58e2ba145bdb64978c331096fd6 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 19:00:29 -0300 Subject: [PATCH 25/31] '#1823: update comments with JonatasGrosman's fine tuned large-v2 model --- iped-app/resources/config/conf/AudioTranscriptConfig.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 76cd11653f..86a1e02252 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -103,7 +103,7 @@ minWordScore = 0.5 # Local 
WhisperTranscriptTask options ######################################### -# Possible values: tiny, base, small, medium, large-v3 +# Possible values: tiny, base, small, medium, large-v3, dwhoelz/whisper-large-pt-cv11-ct2 # large-v3 is much better than medium, but 2x slower and uses 2x more memory. # If you know the language you want to transcribe, please set the 'language' option above. # 'language = auto' uses the 'locale' set on LocalConfig.txt From dd206f870b9496eb4331b7c45565d482d7a7ec8f Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sun, 28 Apr 2024 22:25:50 -0300 Subject: [PATCH 26/31] '#1823: update python package to include needed docopt-0.6.2 lib --- iped-app/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iped-app/pom.xml b/iped-app/pom.xml index 07d95c32e9..a3c30e581c 100644 --- a/iped-app/pom.xml +++ b/iped-app/pom.xml @@ -125,7 +125,7 @@ org.python python-jep-dlib - 3.9.12-4.0.3-19.23.1 + 3.9.12-4.0.3-19.23.1-2 zip false ${release.dir} From 47371727ccc85138ae68b99f54547d0afa406969 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Tue, 30 Apr 2024 14:11:32 -0300 Subject: [PATCH 27/31] '#1823: add a better error message if FFmpeg is not found on PATH --- .../task/transcript/WhisperTranscriptTask.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index bd249ebdd2..dbb416c5c1 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -7,6 +7,7 @@ import java.io.InputStreamReader; import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.lang3.SystemUtils; import org.apache.logging.log4j.LogManager; @@ -15,6 +16,8 @@ import 
iped.configuration.IConfigurationDirectory; import iped.engine.config.AudioTranscriptConfig; import iped.engine.config.Configuration; +import iped.engine.config.ConfigurationManager; +import iped.exception.IPEDException; public class WhisperTranscriptTask extends Wav2Vec2TranscriptTask { @@ -24,6 +27,20 @@ public class WhisperTranscriptTask extends Wav2Vec2TranscriptTask { private static final String LIBRARY_LOADED = "library_loaded"; private static final String MODEL_LOADED = "model_loaded"; + private static final AtomicBoolean ffmpegTested = new AtomicBoolean(); + + @Override + public void init(ConfigurationManager configurationManager) throws Exception { + if (!ffmpegTested.getAndSet(true)) { + try { + Runtime.getRuntime().exec("ffmpeg"); + } catch (IOException e) { + throw new IPEDException("Error checking FFmpeg presence, is it on PATH?"); + } + } + super.init(configurationManager); + } + @Override protected Server startServer0(int device) throws IOException { if (numProcesses != null && device == numProcesses) { From 25654b87f4261bcdb2f8ae59daf338a5d48b83fe Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 25 May 2024 18:15:40 -0300 Subject: [PATCH 28/31] '#1823: support both whisperx and faster_whisper, try whisperx first --- .../resources/scripts/tasks/WhisperProcess.py | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/iped-app/resources/scripts/tasks/WhisperProcess.py b/iped-app/resources/scripts/tasks/WhisperProcess.py index e8d5752979..240c8a9bee 100644 --- a/iped-app/resources/scripts/tasks/WhisperProcess.py +++ b/iped-app/resources/scripts/tasks/WhisperProcess.py @@ -21,7 +21,12 @@ def main(): if language == 'detect': language = None - import whisperx + try: + import whisperx + whisperx_found = True + except: + import faster_whisper + whisperx_found = False print(library_loaded, file=stdout, flush=True) @@ -37,16 +42,22 @@ def main(): deviceNum = 0 try: - model = whisperx.load_model(modelName, device=deviceId, 
device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) - + if whisperx_found: + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) + else: + model = faster_whisper.WhisperModel(modelName, device=deviceId, device_index=deviceNum, cpu_threads=threads, compute_type=compute_type) + except Exception as e: if deviceId != 'cpu': # loading on GPU failed (OOM?), try on CPU - print('FAILED to load model on GPU, fallbacking to CPU!', file=sys.stderr) + print('FAILED to load model on GPU, OOM? Fallbacking to CPU...', file=sys.stderr) deviceId = 'cpu' if compute_type == 'float16': # not supported on CPU compute_type = 'int8' - model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) + if whisperx_found: + model = whisperx.load_model(modelName, device=deviceId, device_index=deviceNum, threads=threads, compute_type=compute_type, language=language) + else: + model = faster_whisper.WhisperModel(modelName, device=deviceId, cpu_threads=threads, compute_type=compute_type) else: raise e @@ -66,12 +77,18 @@ def main(): transcription = '' logprobs = [] try: - audio = whisperx.load_audio(line) - result = model.transcribe(audio, batch_size=batch_size, language=language) - for segment in result['segments']: - transcription += segment['text'] - if 'avg_logprob' in segment: - logprobs.append(segment['avg_logprob']) + if whisperx_found: + audio = whisperx.load_audio(line) + result = model.transcribe(audio, batch_size=batch_size, language=language) + for segment in result['segments']: + transcription += segment['text'] + if 'avg_logprob' in segment: + logprobs.append(segment['avg_logprob']) + else: + segments, info = model.transcribe(audio=line, language=language, beam_size=5, vad_filter=True) + for segment in segments: + transcription += segment.text + 
logprobs.append(segment.avg_logprob) except Exception as e: msg = repr(e).replace('\n', ' ').replace('\r', ' ') From a60330a12a9ce2350cfa44375e6b074b6dfecbd6 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 25 May 2024 18:16:15 -0300 Subject: [PATCH 29/31] '#1823: update error message about missing libraries --- .../java/iped/engine/task/transcript/WhisperTranscriptTask.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index dbb416c5c1..278a0ef203 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -80,7 +80,7 @@ protected Server startServer0(int device) throws IOException { String line = reader.readLine(); if (!LIBRARY_LOADED.equals(line)) { - throw new StartupException("'whisperx' python lib not loaded correctly. Have you installed it?"); + throw new StartupException("Neither 'faster_whisper' nor 'whisperx' python libraries were loaded correctly. Have you installed one of them?"); } int cudaCount = Integer.valueOf(reader.readLine()); From 067fc8fbe5fac18fccfdd6a27686b4a4dd872d0f Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 25 May 2024 18:21:21 -0300 Subject: [PATCH 30/31] '#1823: update config files comments --- iped-app/resources/config/IPEDConfig.txt | 10 ++++++---- .../resources/config/conf/AudioTranscriptConfig.txt | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/iped-app/resources/config/IPEDConfig.txt b/iped-app/resources/config/IPEDConfig.txt index 98d7e024c6..5962493875 100644 --- a/iped-app/resources/config/IPEDConfig.txt +++ b/iped-app/resources/config/IPEDConfig.txt @@ -97,10 +97,12 @@ enableMinIO = false enableOCR = false # Enable audio transcription. 
-# Default implementation uses VOSK transcription on local CPU (slow and not good accuracy). -# You can change it to a local Facebook Wav2Vec2 implementation (slower on CPU, faster on GPU and good accuracy) -# or remote Microsoft Azure or Google Cloud services (faster and good accuracy). -# Configure it in conf/AudioTranscriptConfig.txt +# Default implementation uses VOSK transcription on local CPU (faster but bad accuracy). +# You can change the algorithm in conf/AudioTranscriptConfig.txt: +# - Wav2Vec2 algorithm (slower and good accuracy) +# - Whisper algorithm (much slower but better accuracy) +# - Google Cloud (about $1.00 per hour cost) +# - Microsoft Azure (about $1.00 per hour cost) enableAudioTranscription = false # Enables carving. "addUnallocated" must be enabled to scan unallocated space. diff --git a/iped-app/resources/config/conf/AudioTranscriptConfig.txt b/iped-app/resources/config/conf/AudioTranscriptConfig.txt index 86a1e02252..7814aa1331 100644 --- a/iped-app/resources/config/conf/AudioTranscriptConfig.txt +++ b/iped-app/resources/config/conf/AudioTranscriptConfig.txt @@ -117,6 +117,7 @@ precision = int8 # Batch size (number of parallel transcriptions). If you have a GPU with enough memory, # increasing this value to e.g. 16 can speed up transcribing long audios up to 10x. # Test what is the better value for your GPU before hitting OOM. 
+# This works only if you are using the whisperx library instead of faster_whisper batchSize = 1 ######################################### From f8b3f5f55c31b6685b086d8ffd36b794e7843231 Mon Sep 17 00:00:00 2001 From: Luis Nassif Date: Sat, 25 May 2024 18:25:38 -0300 Subject: [PATCH 31/31] '#1823: log warning instead of aborting if FFmpeg is not on PATH --- .../iped/engine/task/transcript/WhisperTranscriptTask.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java index 278a0ef203..077380d83d 100644 --- a/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java +++ b/iped-engine/src/main/java/iped/engine/task/transcript/WhisperTranscriptTask.java @@ -17,7 +17,6 @@ import iped.engine.config.AudioTranscriptConfig; import iped.engine.config.Configuration; import iped.engine.config.ConfigurationManager; -import iped.exception.IPEDException; public class WhisperTranscriptTask extends Wav2Vec2TranscriptTask { @@ -35,7 +34,7 @@ public void init(ConfigurationManager configurationManager) throws Exception { try { Runtime.getRuntime().exec("ffmpeg"); } catch (IOException e) { - throw new IPEDException("Error checking FFmpeg presence, is it on PATH?"); + logger.warn("FFmpeg not found on PATH, transcription won't work if you switched to WhisperX library."); } } super.init(configurationManager);