diff --git a/crates/goose-server/src/routes/transcribe.rs b/crates/goose-server/src/routes/transcribe.rs index 151415555..7e46d275c 100644 --- a/crates/goose-server/src/routes/transcribe.rs +++ b/crates/goose-server/src/routes/transcribe.rs @@ -1,16 +1,12 @@ -use axum::{ - extract::Multipart, - routing::post, - Json, Router, -}; +use axum::{extract::Multipart, routing::post, Json, Router}; use serde_json::json; -use std::process::Command; -use tempfile::Builder; use std::io::Write; -use tokio::fs; -use tower_http::cors::{CorsLayer, Any}; use std::path::PathBuf; +use std::process::Command; use std::sync::Once; +use tempfile::Builder; +use tokio::fs; +use tower_http::cors::{Any, CorsLayer}; static INIT: Once = Once::new(); @@ -27,7 +23,7 @@ fn ensure_whisper() { if !whisper_path.exists() { println!("Building whisper..."); let whisper_dir = project_dir.join("whisper.cpp"); - + // Build whisper let status = Command::new("make") .current_dir(&whisper_dir) @@ -44,7 +40,7 @@ fn ensure_whisper() { if !model_path.exists() { println!("Downloading whisper model..."); let whisper_dir = project_dir.join("whisper.cpp"); - + // Download model let status = Command::new("bash") .current_dir(&whisper_dir) @@ -66,22 +62,20 @@ pub fn routes() -> Router { // Ensure whisper is installed when creating routes ensure_whisper(); - Router::new() - .route("/transcribe", post(transcribe)) - .layer( - CorsLayer::new() - .allow_origin(Any) - .allow_methods(Any) - .allow_headers(Any) - ) + Router::new().route("/transcribe", post(transcribe)).layer( + CorsLayer::new() + .allow_origin(Any) + .allow_methods(Any) + .allow_headers(Any), + ) } async fn transcribe(mut multipart: Multipart) -> Json { eprintln!("Starting transcription process..."); - + while let Some(field) = multipart.next_field().await.unwrap() { eprintln!("Processing multipart field: {:?}", field.name()); - + if let Ok(data) = field.bytes().await { eprintln!("Received audio data of size: {} bytes", data.len()); if data.len() == 0 { @@ -104,7 +98,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { } }; let webm_path = webm_file.path().to_str().unwrap().to_string(); - + let wav_file = match Builder::new().suffix(".wav").tempfile() { Ok(file) => file, Err(e) => { @@ -116,7 +110,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { } }; let wav_path = wav_file.path().to_str().unwrap().to_string(); - + // Write the WebM data match webm_file.as_file().write_all(&data) { Ok(_) => eprintln!("Successfully wrote WebM data to temporary file"), @@ -187,7 +181,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { eprintln!("Analyzing WebM file with FFprobe..."); let ffprobe_webm = Command::new("ffprobe") .arg("-v") - .arg("error") // Only show errors + .arg("error") // Only show errors .arg("-show_format") .arg("-show_streams") .arg(&webm_path) @@ -197,9 +191,12 @@ async fn transcribe(mut multipart: Multipart) -> Json { let webm_probe_output = String::from_utf8_lossy(&ffprobe_webm.stdout); eprintln!("WebM FFprobe analysis:"); eprintln!("{}", webm_probe_output); - + if !ffprobe_webm.status.success() { - eprintln!("WebM FFprobe error: {}", String::from_utf8_lossy(&ffprobe_webm.stderr)); + eprintln!( + "WebM FFprobe error: {}", + String::from_utf8_lossy(&ffprobe_webm.stderr) + ); return Json(json!({ "success": false, "error": format!("Invalid WebM file: {}", String::from_utf8_lossy(&ffprobe_webm.stderr)) @@ -211,19 +208,19 @@ async fn transcribe(mut multipart: Multipart) -> Json { let ffmpeg_output = Command::new("ffmpeg") .arg("-hide_banner") .arg("-loglevel") - .arg("debug") // Increased logging level + .arg("debug") // Increased logging level .arg("-i") .arg(&webm_path) - .arg("-vn") // Ignore video stream if present + .arg("-vn") // Ignore video stream if present .arg("-acodec") - .arg("pcm_s16le") // Force audio codec + .arg("pcm_s16le") // Force audio codec .arg("-ar") - .arg("16000") // Sample rate that whisper expects + .arg("16000") // Sample rate that whisper expects .arg("-ac") - .arg("1") // Mono audio + .arg("1") // Mono audio .arg("-f") - .arg("wav") // Force WAV format - .arg("-y") // Overwrite output file + .arg("wav") // Force WAV format + .arg("-y") // Overwrite output file .arg(&wav_path) .output() .unwrap(); @@ -231,7 +228,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { eprintln!("FFmpeg conversion details:"); eprintln!("stdout: {}", String::from_utf8_lossy(&ffmpeg_output.stdout)); eprintln!("stderr: {}", String::from_utf8_lossy(&ffmpeg_output.stderr)); - + if !ffmpeg_output.status.success() { eprintln!("FFmpeg conversion failed!"); return Json(json!({ @@ -266,7 +263,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { eprintln!("Analyzing WAV file with FFprobe..."); let ffprobe_wav = Command::new("ffprobe") .arg("-v") - .arg("error") // Only show errors + .arg("error") // Only show errors .arg("-show_format") .arg("-show_streams") .arg(&wav_path) @@ -278,7 +275,10 @@ async fn transcribe(mut multipart: Multipart) -> Json { eprintln!("{}", wav_probe_output); if !ffprobe_wav.status.success() { - eprintln!("WAV FFprobe error: {}", String::from_utf8_lossy(&ffprobe_wav.stderr)); + eprintln!( + "WAV FFprobe error: {}", + String::from_utf8_lossy(&ffprobe_wav.stderr) + ); return Json(json!({ "success": false, "error": format!("Invalid WAV file: {}", String::from_utf8_lossy(&ffprobe_wav.stderr)) @@ -303,8 +303,14 @@ async fn transcribe(mut multipart: Multipart) -> Json { .unwrap(); eprintln!("Whisper process completed"); - eprintln!("Whisper stdout: {}", String::from_utf8_lossy(&output.stdout)); - eprintln!("Whisper stderr: {}", String::from_utf8_lossy(&output.stderr)); + eprintln!( + "Whisper stdout: {}", + String::from_utf8_lossy(&output.stdout) + ); + eprintln!( + "Whisper stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); if output.status.success() { // Read the output text file @@ -314,7 +320,7 @@ async fn transcribe(mut multipart: Multipart) -> Json { // Clean up temporary files eprintln!("Cleaning up temporary files..."); let _ = fs::remove_file(&txt_path).await; - + eprintln!("Transcription successful: {}", text.trim()); return Json(json!({ "success": true, @@ -332,7 +338,10 @@ async fn transcribe(mut multipart: Multipart) -> Json { } else { eprintln!("Whisper process failed"); eprintln!("Error output: {}", String::from_utf8_lossy(&output.stderr)); - eprintln!("Standard output: {}", String::from_utf8_lossy(&output.stdout)); + eprintln!( + "Standard output: {}", + String::from_utf8_lossy(&output.stdout) + ); return Json(json!({ "success": false, "error": format!("Whisper failed: {}", String::from_utf8_lossy(&output.stderr)) diff --git a/ui/desktop/src/components/AudioRecorder.tsx b/ui/desktop/src/components/AudioRecorder.tsx index 4b4c20e8a..a07195498 100644 --- a/ui/desktop/src/components/AudioRecorder.tsx +++ b/ui/desktop/src/components/AudioRecorder.tsx @@ -4,16 +4,135 @@ import { Mic, Square } from 'lucide-react'; import { getApiUrl } from "../config"; import WaveSurfer from 'wavesurfer.js'; import RecordPlugin from 'wavesurfer.js/dist/plugins/record.esm.js'; +declare class Blob{} +declare class FormData{} + +// Separate button component +export const AudioButton = ({ + isRecording, + onClick, +}: { + isRecording: boolean; + onClick: () => void; +}) => ( + +); + +// Separate waveform component with its own state management +export const AudioWaveform = React.forwardRef< + HTMLDivElement, + { + isRecording: boolean; + onRecordEnd?: (blob: Blob) => void; + className?: string; + } +>(({ isRecording, onRecordEnd, className = '' }, ref) => { + const wavesurferRef = useRef(null); + const recordPluginRef = useRef(null); + const [progress, setProgress] = useState('00:00'); -interface AudioRecorderProps { - onTranscription: (text: string) => void; -} + const handleRecordProgress = useCallback((time: number) => { + const minutes = Math.floor((time % 3600000) / 60000); + const seconds = Math.floor((time % 60000) / 1000); + const formattedTime = [minutes, seconds] + .map(v => v < 10 ? '0' + v : v) + .join(':'); + setProgress(formattedTime); + }, []); + + useEffect(() => { + const container = ref as React.RefObject; + if (!container.current) return; + + const wavesurfer = WaveSurfer.create({ + container: container.current, + waveColor: 'rgb(99, 102, 241)', // Indigo-600 + progressColor: 'rgb(79, 70, 229)', // Indigo-700 + height: 26, + barWidth: 2, + barGap: 1, + barRadius: 1, + normalize: true, + minPxPerSec: 50, // Increase this value to make the waveform wider + }); + + const recordPlugin = wavesurfer.registerPlugin( + RecordPlugin.create({ + renderRecordedAudio: false, + scrollingWaveform: false, + continuousWaveform: true, + continuousWaveformDuration: 30, + }) + ); + + if (onRecordEnd) { + recordPlugin.on('record-end', onRecordEnd); + } + recordPlugin.on('record-progress', handleRecordProgress); + + wavesurferRef.current = wavesurfer; + recordPluginRef.current = recordPlugin; -export function AudioRecorder({ onTranscription }: AudioRecorderProps) { + return () => { + wavesurfer.destroy(); + wavesurferRef.current = null; + recordPluginRef.current = null; + }; + }, [ref, onRecordEnd, handleRecordProgress]); + + useEffect(() => { + const recordPlugin = recordPluginRef.current; + if (!recordPlugin) return; + + const handleRecording = async () => { + if (isRecording) { + try { + await recordPlugin.startRecording(); + } catch (err) { + console.error('Failed to start recording:', err); + } + } else { + try { + if (recordPlugin.isRecording()) { + await recordPlugin.stopRecording(); + setProgress('00:00'); + } + } catch (err) { + console.error('Failed to stop recording:', err); + } + } + }; + + handleRecording(); + }, [isRecording]); + + return ( +
+
+
+ ); +}); + +AudioWaveform.displayName = 'AudioWaveform'; + +// Main AudioRecorder component that combines both +export function AudioRecorder({ onTranscription, containerClassName }: { + onTranscription: (text: string) => void; + containerClassName?: string; +}) { const [isRecording, setIsRecording] = useState(false); - const [progress, setProgress] = useState('00:00'); - const wavesurferRef = useRef(null); - const recordPluginRef = useRef(null); const micContainerRef = useRef(null); const handleRecordEnd = useCallback(async (blob: Blob) => { @@ -43,108 +162,19 @@ export function AudioRecorder({ onTranscription }: AudioRecorderProps) { } }, [onTranscription]); - const handleRecordProgress = useCallback((time: number) => { - const minutes = Math.floor((time % 3600000) / 60000); - const seconds = Math.floor((time % 60000) / 1000); - const formattedTime = [minutes, seconds] - .map(v => v < 10 ? '0' + v : v) - .join(':'); - setProgress(formattedTime); + const handleToggleRecording = useCallback(() => { + setIsRecording(prev => !prev); }, []); - useEffect(() => { - let wavesurfer: WaveSurfer | null = null; - let recordPlugin: any = null; - - const initializeWaveSurfer = () => { - if (!micContainerRef.current) return; - - // Create new WaveSurfer instance - wavesurfer = WaveSurfer.create({ - container: micContainerRef.current, - waveColor: 'rgb(99, 102, 241)', // Indigo-600 - progressColor: 'rgb(79, 70, 229)', // Indigo-700 - height: 40, - }); - - // Initialize Record plugin - recordPlugin = wavesurfer.registerPlugin( - RecordPlugin.create({ - renderRecordedAudio: false, - scrollingWaveform: false, - continuousWaveform: true, - continuousWaveformDuration: 30, - }) - ); - - // Set up event handlers - recordPlugin.on('record-end', handleRecordEnd); - recordPlugin.on('record-progress', handleRecordProgress); - - // Store references - wavesurferRef.current = wavesurfer; - recordPluginRef.current = recordPlugin; - }; - - initializeWaveSurfer(); - - // Cleanup - return () => { - if (wavesurfer) { - wavesurfer.destroy(); - } - wavesurferRef.current = null; - recordPluginRef.current = null; - }; - }, [handleRecordEnd, handleRecordProgress]); - - const startRecording = async () => { - console.log('Attempting to start recording...'); - try { - if (!recordPluginRef.current) { - console.error('Record plugin not initialized'); - return; - } - - await recordPluginRef.current.startRecording(); - console.log('Recording started!'); - setIsRecording(true); - } catch (err) { - console.error('Failed to start recording:', err); - } - }; - - const stopRecording = async () => { - if (!recordPluginRef.current || !isRecording) return; - - console.log('Stopping recording...'); - try { - await recordPluginRef.current.stopRecording(); - setIsRecording(false); - setProgress('00:00'); - } catch (err) { - console.error('Failed to stop recording:', err); - } - }; - return ( -
-
-
- -
+
+ +
); } diff --git a/ui/desktop/src/components/Input.tsx b/ui/desktop/src/components/Input.tsx index 9d8910e19..1fa540e39 100644 --- a/ui/desktop/src/components/Input.tsx +++ b/ui/desktop/src/components/Input.tsx @@ -3,7 +3,10 @@ import { Button } from './ui/button'; import Send from './ui/Send'; import Stop from './ui/Stop'; import { Paperclip } from 'lucide-react'; -import { AudioRecorder } from './AudioRecorder'; +import { getApiUrl } from "../config"; +import { AudioButton, AudioWaveform } from './AudioRecorder'; +declare class Blob{} +declare class FormData{} interface InputProps { handleSubmit: (e: React.FormEvent) => void; @@ -27,7 +30,9 @@ export default function Input({ onStop }: InputProps) { const [value, setValue] = useState(''); + const [isRecording, setIsRecording] = useState(false); const textAreaRef = useRef(null); + const waveformRef = useRef(null); useEffect(() => { if (textAreaRef.current && !disabled) { @@ -81,36 +86,69 @@ export default function Input({ } }; - const handleTranscription = (text: string) => { - if(text != undefined) { - setValue(text); - textAreaRef.current?.focus(); + const handleRecordEnd = async (blob: Blob) => { + try { + console.log('Recording completed, size:', blob.size, 'type:', blob.type); + const formData = new FormData(); + formData.append('audio', blob, 'audio.webm'); + + const response = await fetch(getApiUrl('/transcribe'), { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + throw new Error('Transcription failed'); + } + + const result = await response.json(); + console.log('Received response:', result); + if (result.success) { + setValue(result.text); + textAreaRef.current?.focus(); + } else { + console.error('Transcription error:', result.error); + } + } catch (err) { + console.error('Transcription error:', err); } }; return (
-