diff --git a/cmake/FindFFmpeg.cmake b/cmake/FindFFmpeg.cmake
new file mode 100644
index 00000000000..19dc751605e
--- /dev/null
+++ b/cmake/FindFFmpeg.cmake
@@ -0,0 +1,164 @@
+# From
+#   https://github.com/snikulov/cmake-modules/blob/master/FindFFmpeg.cmake
+#
+# vim: ts=2 sw=2
+# - Try to find the required ffmpeg components (default: AVFORMAT, AVCODEC, AVUTIL, SWRESAMPLE)
+#
+# Once done this will define
+#  FFMPEG_FOUND         - System has all the required components.
+#  FFMPEG_INCLUDE_DIRS  - Include directory necessary for using the required components headers.
+#  FFMPEG_LIBRARIES     - Link these to use the required ffmpeg components.
+#  FFMPEG_DEFINITIONS   - Compiler switches required for using the required ffmpeg components.
+#
+# For each of the following components
+#   - AVCODEC
+#   - AVDEVICE
+#   - AVFORMAT
+#   - AVFILTER
+#   - AVUTIL
+#   - POSTPROC
+#   - SWSCALE
+#   - SWRESAMPLE
+# the variables below will additionally be defined
+#  <component>_FOUND        - System has <component>
+#  <component>_INCLUDE_DIRS - Include directory necessary for using the <component> headers
+#  <component>_LIBRARIES    - Link these to use <component>
+#  <component>_DEFINITIONS  - Compiler switches required for using <component>
+#  <component>_VERSION      - The components version
+#
+# Copyright (c) 2006, Matthias Kretz,
+# Copyright (c) 2008, Alexander Neundorf,
+# Copyright (c) 2011, Michael Jansen,
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+include(FindPackageHandleStandardArgs)
+
+# The default components were taken from a survey over other FindFFMPEG.cmake files
+if (NOT FFmpeg_FIND_COMPONENTS)
+  set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
+endif()
+
+#
+### Macro: set_component_found
+#
+# Marks the given component as found if both *_LIBRARIES and *_INCLUDE_DIRS are present.
+#
+macro(set_component_found _component)
+  if (${_component}_LIBRARIES AND ${_component}_INCLUDE_DIRS)
+    message(DEBUG "  - ${_component} found.")
+    set(${_component}_FOUND TRUE)
+  else ()
+    message(DEBUG "  - ${_component} not found.")
+  endif ()
+endmacro()
+
+#
+### Macro: find_component
+#
+# Checks for the given component by invoking pkgconfig and then looking up the libraries and
+# include directories.
+#
+macro(find_component _component _pkgconfig _library _header)
+
+  if (NOT WIN32)
+    # use pkg-config to get the directories and then use these values
+    # in the FIND_PATH() and FIND_LIBRARY() calls
+    find_package(PkgConfig QUIET)
+    if (PKG_CONFIG_FOUND)
+      pkg_check_modules(PC_${_component} QUIET ${_pkgconfig})
+      message(DEBUG "Pkgconfig ${_pkgconfig} includedir: ${PC_${_component}_INCLUDEDIR}")
+      message(DEBUG "Pkgconfig ${_pkgconfig} include dirs: ${PC_${_component}_INCLUDE_DIRS}")
+      message(DEBUG "Pkgconfig ${_pkgconfig} cflags: ${PC_${_component}_CFLAGS}")
+    endif ()
+  endif ()
+
+
+  find_path(${_component}_INCLUDE_DIRS ${_header}
+    HINTS
+      ${PC_${_component}_INCLUDEDIR}
+      ${PC_${_component}_INCLUDE_DIRS}
+    PATH_SUFFIXES
+      ffmpeg
+  )
+
+  # CMake's default is to search first for shared libraries and then for static libraries.
+  # Todo later: add option to prefer static libs over dynamic:
+  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
+    HINTS
+      ${PC_${_component}_LIBDIR}
+      ${PC_${_component}_LIBRARY_DIRS}
+  )
+
+  set(${_component}_DEFINITIONS  ${PC_${_component}_CFLAGS_OTHER} CACHE STRING "The ${_component} CFLAGS.")
+  set(${_component}_VERSION      ${PC_${_component}_VERSION}      CACHE STRING "The ${_component} version number.")
+
+  set_component_found(${_component})
+
+  mark_as_advanced(
+    ${_component}_INCLUDE_DIRS
+    ${_component}_LIBRARIES
+    ${_component}_DEFINITIONS
+    ${_component}_VERSION)
+
+endmacro()
+
+
+# Check for cached results. If there are any, skip the costly part.
+if (NOT FFMPEG_LIBRARIES)
+
+  # Check for all possible components.
+  find_component(AVCODEC    libavcodec    avcodec  libavcodec/avcodec.h)
+  find_component(AVFORMAT   libavformat   avformat libavformat/avformat.h)
+  find_component(AVDEVICE   libavdevice   avdevice libavdevice/avdevice.h)
+  #find_component(AVRESAMPLE libavresample avresample libavresample/avresample.h) # old name for swresample
+  find_component(AVUTIL     libavutil     avutil   libavutil/avutil.h)
+  find_component(AVFILTER   libavfilter   avfilter libavfilter/avfilter.h)
+  find_component(SWSCALE    libswscale    swscale  libswscale/swscale.h)
+  find_component(POSTPROC   libpostproc   postproc libpostproc/postprocess.h)
+  find_component(SWRESAMPLE libswresample swresample libswresample/swresample.h)
+
+  # Check if the required components were found and add their stuff to the FFMPEG_* vars.
+  foreach (_component ${FFmpeg_FIND_COMPONENTS})
+    if (${_component}_FOUND)
+      # message(STATUS "Required component ${_component} present.")
+      set(FFMPEG_LIBRARIES   ${FFMPEG_LIBRARIES}   ${${_component}_LIBRARIES})
+      set(FFMPEG_DEFINITIONS ${FFMPEG_DEFINITIONS} ${${_component}_DEFINITIONS})
+      list(APPEND FFMPEG_INCLUDE_DIRS ${${_component}_INCLUDE_DIRS})
+    else ()
+      # message(STATUS "Required component ${_component} missing.")
+    endif ()
+  endforeach ()
+
+  # Build the include path with duplicates removed.
+  if (FFMPEG_INCLUDE_DIRS)
+    list(REMOVE_DUPLICATES FFMPEG_INCLUDE_DIRS)
+  endif ()
+
+  # cache the vars.
+  set(FFMPEG_INCLUDE_DIRS ${FFMPEG_INCLUDE_DIRS} CACHE STRING "The FFmpeg include directories." FORCE)
+  set(FFMPEG_LIBRARIES    ${FFMPEG_LIBRARIES}    CACHE STRING "The FFmpeg libraries." FORCE)
+  set(FFMPEG_DEFINITIONS  ${FFMPEG_DEFINITIONS}  CACHE STRING "The FFmpeg cflags." FORCE)
+
+  mark_as_advanced(FFMPEG_INCLUDE_DIRS
+                   FFMPEG_LIBRARIES
+                   FFMPEG_DEFINITIONS)
+
+endif ()
+
+# Now set the noncached _FOUND vars for the components.
+# Must list exactly the components probed by find_component() above.
+foreach (_component AVCODEC AVDEVICE AVFORMAT AVFILTER AVUTIL POSTPROC SWSCALE SWRESAMPLE)
+  set_component_found(${_component})
+endforeach ()
+
+# Compile the list of required vars
+set(_FFmpeg_REQUIRED_VARS FFMPEG_LIBRARIES FFMPEG_INCLUDE_DIRS)
+foreach (_component ${FFmpeg_FIND_COMPONENTS})
+  list(APPEND _FFmpeg_REQUIRED_VARS ${_component}_LIBRARIES ${_component}_INCLUDE_DIRS)
+endforeach ()
+
+# Give a nice error message if some of the required vars are missing.
+find_package_handle_standard_args(FFmpeg DEFAULT_MSG ${_FFmpeg_REQUIRED_VARS})
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3b493e3db7e..24678e1c6ac 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -22,6 +22,10 @@ endif()
 
 set(TARGET common)
 
+if (WHISPER_FFMPEG)
+    set(COMMON_SOURCES_FFMPEG ffmpeg-transcode.cpp)
+endif()
+
 add_library(${TARGET} STATIC
     common.h
     common.cpp
@@ -29,6 +33,7 @@ add_library(${TARGET} STATIC
     common-ggml.cpp
     grammar-parser.h
     grammar-parser.cpp
+    ${COMMON_SOURCES_FFMPEG}
     )
 
 include(DefaultTargetOptions)
diff --git a/examples/common.cpp b/examples/common.cpp
index 2c0cdf082ed..25a0272cf08 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -24,6 +24,11 @@
 #include <io.h>
 #endif
 
+#ifdef WHISPER_FFMPEG
+// implemented in ffmpeg-transcode.cpp, which is only part of the common lib when whisper is built with ffmpeg support
+extern int ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
+#endif
+
 // Function to check if the next argument exists
 std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
     if (i + 1 < argc && argv[i + 1][0] != '-') {
@@ -637,7 +642,7 @@ bool is_wav_buffer(const std::string buf) {
 
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
     drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
+    std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
 
     if (fname == "-") {
         {
@@ -670,8 +675,19 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
         }
     }
     else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
+#if defined(WHISPER_FFMPEG)
+        if (ffmpeg_decode_audio(fname, wav_data) != 0) {
+            fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
+            return false;
+        }
+        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+            fprintf(stderr, "error: failed to read wav data as wav \n");
+            return false;
+        }
+#else
         fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
         return false;
+#endif
     }
 
     if (wav.channels != 1 && wav.channels != 2) {
diff --git a/examples/ffmpeg-transcode.cpp b/examples/ffmpeg-transcode.cpp
new file mode 100644
index 00000000000..910cdf5700b
--- /dev/null
+++ b/examples/ffmpeg-transcode.cpp
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * transcode.c - convert audio file to WAVE
+ *
+ * Copyright (C) 2019 Andrew Clayton
+ * Copyright (C) 2024 William Tambellini
+ */
+
+// Just for convenient C++ API
+#include <vector>
+#include <string>
+
+// C
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+extern "C" {
+#include <libavutil/opt.h>
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libswresample/swresample.h>
+}
+
+typedef uint64_t u64;
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+#define WAVE_SAMPLE_RATE 16000
+#define AVIO_CTX_BUF_SZ 4096
+
+static const char* ffmpegLog = getenv("FFMPEG_LOG");
+// Todo: add __FILE__ __LINE__
+#define LOG(...) \
+    do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99
+
+/*
+ * WAVE file header based on definition from
+ * https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f
+ *
+ * We must ensure this structure doesn't have any holes or
+ * padding so we can just map it straight to the WAVE data.
+ */
+struct wave_hdr {
+    /* RIFF Header: "RIFF" */
+    char riff_header[4];
+    /* size of audio data + sizeof(struct wave_hdr) - 8 */
+    int wav_size;
+    /* "WAVE" */
+    char wav_header[4];
+
+    /* Format Header */
+    /* "fmt " (includes trailing space) */
+    char fmt_header[4];
+    /* Should be 16 for PCM */
+    int fmt_chunk_size;
+    /* Should be 1 for PCM. 3 for IEEE Float */
+    s16 audio_format;
+    s16 num_channels;
+    int sample_rate;
+    /*
+     * Number of bytes per second
+     * sample_rate * num_channels * bit_depth/8
+     */
+    int byte_rate;
+    /* num_channels * bytes per sample */
+    s16 sample_alignment;
+    /* bits per sample */
+    s16 bit_depth;
+
+    /* Data Header */
+    /* "data" */
+    char data_header[4];
+    /*
+     * size of audio
+     * number of samples * num_channels * bit_depth/8
+     */
+    int data_bytes;
+} __attribute__((__packed__));
+
+struct audio_buffer {
+    u8 *ptr;
+    int size; /* size left in the buffer */
+};
+
+/* Fill `wh` with a 16-bit mono PCM header describing `size` bytes of samples. */
+static void set_wave_hdr(wave_hdr& wh, size_t size) {
+    memcpy(&wh.riff_header, "RIFF", 4);
+    wh.wav_size = size + sizeof(struct wave_hdr) - 8;
+    memcpy(&wh.wav_header, "WAVE", 4);
+    memcpy(&wh.fmt_header, "fmt ", 4);
+    wh.fmt_chunk_size = 16;
+    wh.audio_format = 1;
+    wh.num_channels = 1;
+    wh.sample_rate = WAVE_SAMPLE_RATE;
+    wh.sample_alignment = 2;
+    wh.bit_depth = 16;
+    wh.byte_rate = wh.sample_rate * wh.sample_alignment;
+    memcpy(&wh.data_header, "data", 4);
+    wh.data_bytes = size;
+}
+
+/* Map the whole file behind `fd` into memory; returns 0 on success, -1 on error. */
+static int map_file(int fd, u8 **ptr, size_t *size)
+{
+    struct stat sb;
+
+    if (fstat(fd, &sb) != 0) {
+        perror("fstat");
+        return -1;
+    }
+    *size = sb.st_size;
+
+    *ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (*ptr == MAP_FAILED) {
+        perror("mmap");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* AVIO read callback: hand out the next chunk of the in-memory input. */
+static int read_packet(void *opaque, u8 *buf, int buf_size)
+{
+    struct audio_buffer *audio_buf = (audio_buffer*)opaque;
+
+    buf_size = FFMIN(buf_size, audio_buf->size);
+    if (buf_size <= 0) {
+        /* a read callback must report end of input as AVERROR_EOF, never 0 */
+        return AVERROR_EOF;
+    }
+
+    /* copy internal buffer data to buf */
+    memcpy(buf, audio_buf->ptr, buf_size);
+    audio_buf->ptr += buf_size;
+    audio_buf->size -= buf_size;
+
+    return buf_size;
+}
+
+/*
+ * Resample one decoded frame (or, when flush is set, whatever the resampler
+ * still buffers) to WAVE_SAMPLE_RATE mono s16 and append it to *data.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+static int convert_frame(struct SwrContext *swr, AVCodecContext *codec,
+                         AVFrame *frame, s16 **data, int *size, bool flush)
+{
+    /* when flushing there is no new input, only the resampler's backlog */
+    const int in_samples = flush ? 0 : frame->nb_samples;
+    int nr_samples;
+    s64 delay;
+    u8 *buffer = NULL;
+    s16 *grown;
+
+    delay = swr_get_delay(swr, codec->sample_rate);
+    nr_samples = av_rescale_rnd(delay + in_samples,
+                                WAVE_SAMPLE_RATE, codec->sample_rate,
+                                AV_ROUND_UP);
+    if (nr_samples <= 0)
+        return 0;
+
+    if (av_samples_alloc(&buffer, NULL, 1, nr_samples, AV_SAMPLE_FMT_S16, 0) < 0)
+        return AVERROR(ENOMEM);
+
+    nr_samples = swr_convert(swr, &buffer, nr_samples,
+                             flush ? NULL : (const u8 **)frame->data,
+                             in_samples);
+    if (nr_samples > 0) {
+        grown = (s16*)realloc(*data, (*size + nr_samples) * sizeof(s16));
+        if (!grown) {
+            av_freep(&buffer);
+            return AVERROR(ENOMEM);
+        }
+        memcpy(grown + *size, buffer, nr_samples * sizeof(s16));
+        *data = grown;
+        *size += nr_samples;
+    }
+    av_freep(&buffer);
+
+    return nr_samples < 0 ? nr_samples : 0;
+}
+
+/* Pull every frame the decoder currently has ready and resample it. */
+static int receive_frames(struct SwrContext *swr, AVCodecContext *codec,
+                          AVFrame *frame, s16 **data, int *size)
+{
+    int err;
+
+    while (avcodec_receive_frame(codec, frame) >= 0) {
+        err = convert_frame(swr, codec, frame, data, size, false);
+        if (err < 0)
+            return err;
+    }
+
+    return 0;
+}
+
+static bool is_audio_stream(const AVStream *stream)
+{
+    return stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO;
+}
+
+// Return non zero on error, 0 on success
+// audio_buffer: input memory
+// data: decoded output samples (malloc'ed; owned by the caller on success, NULL on error)
+// size: number of s16 samples stored in data
+static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
+{
+    LOG("decode_audio: input size: %d\n", audio_buf->size);
+    AVFormatContext *fmt_ctx = NULL;
+    AVIOContext *avio_ctx = NULL;
+    AVStream *stream;
+    AVCodecContext *codec = NULL;
+    const AVCodec *decoder;
+    AVPacket packet;
+    AVFrame *frame = NULL;
+    struct SwrContext *swr = NULL;
+    u8 *avio_ctx_buffer;
+    s64 in_layout;
+    unsigned int i;
+    int stream_index = -1;
+    int err;
+    const size_t errbuffsize = 1024;
+    char errbuff[errbuffsize];
+
+    *data = NULL;
+    *size = 0;
+
+#if LIBAVFORMAT_VERSION_INT < AV_VERSION_INT(58, 9, 100)
+    av_register_all(); // mandatory before FFmpeg 4.0, deprecated (later removed) afterwards
+#endif
+
+    fmt_ctx = avformat_alloc_context();
+    avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
+    LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
+    if (fmt_ctx && avio_ctx_buffer)
+        avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
+    if (!avio_ctx) {
+        LOG("Could not allocate the demuxer contexts\n");
+        av_free(avio_ctx_buffer);
+        avformat_free_context(fmt_ctx);
+        fmt_ctx = NULL;
+        err = AVERROR(ENOMEM);
+        goto cleanup;
+    }
+    fmt_ctx->pb = avio_ctx;
+
+    // open the input stream and read header
+    // (on failure avformat_open_input() frees fmt_ctx and resets it to NULL)
+    err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL);
+    if (err < 0) {
+        LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err));
+        goto cleanup;
+    }
+
+    err = avformat_find_stream_info(fmt_ctx, NULL);
+    if (err < 0) {
+        LOG("Could not retrieve stream info from audio buffer: %d\n", err);
+        goto cleanup;
+    }
+
+    for (i = 0; i < fmt_ctx->nb_streams; i++) {
+        if (is_audio_stream(fmt_ctx->streams[i])) {
+            stream_index = i;
+            break;
+        }
+    }
+
+    if (stream_index == -1) {
+        LOG("Could not retrieve audio stream from buffer\n");
+        err = -1;
+        goto cleanup;
+    }
+
+    stream = fmt_ctx->streams[stream_index];
+    decoder = avcodec_find_decoder(stream->codecpar->codec_id);
+    codec = decoder ? avcodec_alloc_context3(decoder) : NULL;
+    if (!codec) {
+        LOG("No usable decoder for stream #%d in audio buffer\n", stream_index);
+        err = -1;
+        goto cleanup;
+    }
+    err = avcodec_parameters_to_context(codec, stream->codecpar);
+    if (err >= 0)
+        err = avcodec_open2(codec, decoder, NULL);
+    if (err < 0) {
+        LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index);
+        goto cleanup;
+    }
+
+    /* prepare resampler */
+    swr = swr_alloc();
+    if (!swr) {
+        LOG("Error allocating the resampler\n");
+        err = AVERROR(ENOMEM);
+        goto cleanup;
+    }
+
+    /* some inputs carry no channel layout; derive one from the channel count */
+    in_layout = codec->channel_layout;
+    if (!in_layout)
+        in_layout = av_get_default_channel_layout(codec->channels);
+
+    av_opt_set_int(swr, "in_channel_count",  codec->channels, 0);
+    av_opt_set_int(swr, "out_channel_count", 1, 0);
+    av_opt_set_int(swr, "in_channel_layout",  in_layout, 0);
+    av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0);
+    av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
+    av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
+    av_opt_set_sample_fmt(swr, "in_sample_fmt",  codec->sample_fmt, 0);
+    av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
+
+    err = swr_init(swr);
+    if (err < 0 || !swr_is_initialized(swr)) {
+        LOG("Resampler has not been properly initialized\n");
+        err = -1;
+        goto cleanup;
+    }
+
+    av_init_packet(&packet);
+    frame = av_frame_alloc();
+    if (!frame) {
+        LOG("Error allocating the frame\n");
+        err = AVERROR(ENOMEM);
+        goto cleanup;
+    }
+
+    /* iterate through frames */
+    err = 0;
+    while (err >= 0 && av_read_frame(fmt_ctx, &packet) >= 0) {
+        /* skip packets of other streams (video, subtitles, ...) and undecodable ones */
+        if (packet.stream_index == stream_index &&
+            avcodec_send_packet(codec, &packet) >= 0)
+            err = receive_frames(swr, codec, frame, data, size);
+        av_packet_unref(&packet);
+    }
+
+    /* drain the frames still queued inside the decoder... */
+    if (err >= 0 && avcodec_send_packet(codec, NULL) >= 0)
+        err = receive_frames(swr, codec, frame, data, size);
+    /* ...and flush any remaining conversion buffers */
+    if (err >= 0)
+        err = convert_frame(swr, codec, frame, data, size, true);
+
+cleanup:
+    av_frame_free(&frame);
+    swr_free(&swr);
+    avcodec_free_context(&codec);
+    avformat_close_input(&fmt_ctx);
+
+    if (avio_ctx) {
+        av_freep(&avio_ctx->buffer);
+        av_freep(&avio_ctx);
+    }
+
+    if (err < 0) {
+        free(*data);
+        *data = NULL;
+        *size = 0;
+    }
+
+    return err;
+}
+
+// in mem decoding/conversion/resampling:
+// ifname: input file path
+// owav_data: in mem wav file. Can be forwarded as is to whisper/drwav
+// return 0 on success
+int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
+    LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
+    int ifd = open(ifname.c_str(), O_RDONLY);
+    if (ifd == -1) {
+        fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
+        return -1;
+    }
+    u8 *ibuf = NULL;
+    size_t ibuf_size;
+    int err = map_file(ifd, &ibuf, &ibuf_size);
+    close(ifd); // a successful mapping stays valid after the descriptor is closed
+    if (err) {
+        LOG("Couldn't map input file %s\n", ifname.c_str());
+        return err;
+    }
+    LOG("Mapped input file: %p size: %zu\n", (void*)ibuf, ibuf_size);
+    struct audio_buffer inaudio_buf;
+    inaudio_buf.ptr = ibuf;
+    inaudio_buf.size = ibuf_size;
+
+    s16 *odata=NULL;
+    int osize=0;
+
+    err = decode_audio(&inaudio_buf, &odata, &osize);
+    munmap(ibuf, ibuf_size);
+    LOG("decode_audio returned %d \n", err);
+    if (err != 0) {
+        LOG("decode_audio failed\n");
+        return err;
+    }
+    LOG("decode_audio output size: %d\n", osize);
+
+    wave_hdr wh;
+    const size_t outdatasize = osize * sizeof(s16);
+    set_wave_hdr(wh, outdatasize);
+    owav_data.resize(sizeof(wave_hdr) + outdatasize);
+    // header:
+    memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
+    // the data:
+    if (outdatasize > 0)
+        memcpy(owav_data.data() + sizeof(wave_hdr), odata, outdatasize);
+    free(odata);
+
+    return 0;
+}
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index 1bb16f58214..1e66e4b5cc8 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
 
 include(DefaultTargetOptions)
 
-target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/samples/.gitignore b/samples/.gitignore
index 72e8ffc0db8..e084659df25 100644
--- a/samples/.gitignore
+++ b/samples/.gitignore
@@ -1 +1,4 @@
 *
+!jfk.wav
+!jfk.mp3
+
diff --git a/samples/jfk.mp3 b/samples/jfk.mp3
new file mode 100644
index 00000000000..fbfa1d98973
Binary files /dev/null and b/samples/jfk.mp3 differ
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5366f848b09..295bc48cf53 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -74,3 +74,14 @@ add_test(NAME ${TEST_TARGET}
   -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin
   -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
 set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")
+
+if (WHISPER_FFMPEG)
+  set(TEST_TARGET test-main-tiny-mp3)
+  # Check with reviewers: any way to check the output transcription via ctest (diff, ...)?
+  add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.mp3)
+  set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;mp3")
+endif()
+