diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt index 8b69984cee3..f0182b17a17 100644 --- a/torchaudio/csrc/ffmpeg/CMakeLists.txt +++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt @@ -2,8 +2,6 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}") find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil) add_library(ffmpeg INTERFACE) target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}") -target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}") - set( sources diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp index 66bd222c050..a465e91ab06 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp +++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,6 +9,8 @@ namespace torchaudio { namespace io { +using torchaudio::io::detail::libav; + //////////////////////////////////////////////////////////////////////////////// // AVDictionary //////////////////////////////////////////////////////////////////////////////// @@ -15,7 +18,7 @@ AVDictionary* get_option_dict(const c10::optional& option) { AVDictionary* opt = nullptr; if (option) { for (auto const& [key, value] : option.value()) { - av_dict_set(&opt, key.c_str(), value.c_str(), 0); + libav().av_dict_set(&opt, key.c_str(), value.c_str(), 0); } } return opt; @@ -26,10 +29,10 @@ void clean_up_dict(AVDictionary* p) { std::vector unused_keys; // Check and copy unused keys, clean up the original dictionary AVDictionaryEntry* t = nullptr; - while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { + while ((t = libav().av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { unused_keys.emplace_back(t->key); } - av_dict_free(&p); + libav().av_dict_free(&p); TORCH_CHECK( unused_keys.empty(), "Unexpected options: ", @@ -41,14 +44,14 @@ void clean_up_dict(AVDictionary* p) { // AVFormatContext //////////////////////////////////////////////////////////////////////////////// void AVFormatInputContextDeleter::operator()(AVFormatContext* p) { - avformat_close_input(&p); + libav().avformat_close_input(&p); }; AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p) : Wrapper(p) {} void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) { - avformat_free_context(p); + libav().avformat_free_context(p); }; AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) @@ -58,9 +61,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) // AVIO //////////////////////////////////////////////////////////////////////////////// void AVIOContextDeleter::operator()(AVIOContext* p) { - avio_flush(p); - av_freep(&p->buffer); - av_freep(&p); + libav().avio_flush(p); + libav().av_freep(&p->buffer); + libav().av_freep(&p); }; AVIOContextPtr::AVIOContextPtr(AVIOContext* p) @@ -70,13 +73,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p) // AVPacket //////////////////////////////////////////////////////////////////////////////// void AVPacketDeleter::operator()(AVPacket* p) { - av_packet_free(&p); + libav().av_packet_free(&p); }; AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper(p) {} AVPacketPtr alloc_avpacket() { - AVPacket* p = av_packet_alloc(); + AVPacket* p = libav().av_packet_alloc(); TORCH_CHECK(p, "Failed to allocate AVPacket object."); return AVPacketPtr{p}; } @@ -86,7 +89,7 @@ AVPacketPtr alloc_avpacket() { //////////////////////////////////////////////////////////////////////////////// AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){}; AutoPacketUnref::~AutoPacketUnref() { - av_packet_unref(p_); + libav().av_packet_unref(p_); } AutoPacketUnref::operator AVPacket*() const { return p_; @@ -96,13 +99,13 @@ AutoPacketUnref::operator AVPacket*() const { // AVFrame //////////////////////////////////////////////////////////////////////////////// void AVFrameDeleter::operator()(AVFrame* p) { - av_frame_free(&p); + libav().av_frame_free(&p); }; AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper(p) {} AVFramePtr alloc_avframe() { - AVFrame* p = av_frame_alloc(); + AVFrame* p = libav().av_frame_alloc(); TORCH_CHECK(p, "Failed to allocate AVFrame object."); return AVFramePtr{p}; }; @@ -111,7 +114,7 @@ AVFramePtr alloc_avframe() { // AVCodecContext //////////////////////////////////////////////////////////////////////////////// void AVCodecContextDeleter::operator()(AVCodecContext* p) { - avcodec_free_context(&p); + libav().avcodec_free_context(&p); }; AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) @@ -121,7 +124,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) // AVBufferRefPtr //////////////////////////////////////////////////////////////////////////////// void AutoBufferUnref::operator()(AVBufferRef* p) { - av_buffer_unref(&p); + libav().av_buffer_unref(&p); } AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) @@ -131,7 +134,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) // AVFilterGraph //////////////////////////////////////////////////////////////////////////////// void AVFilterGraphDeleter::operator()(AVFilterGraph* p) { - avfilter_graph_free(&p); + libav().avfilter_graph_free(&p); }; AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) @@ -141,7 +144,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) // AVCodecParameters //////////////////////////////////////////////////////////////////////////////// void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) { - avcodec_parameters_free(&codecpar); + libav().avcodec_parameters_free(&codecpar); } AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p) diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h index 0bae00c12d8..9c40d7fe117 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.h +++ b/torchaudio/csrc/ffmpeg/ffmpeg.h @@ -6,6 +6,9 @@ #include #include +#include +#include + extern "C" { #include #include @@ -29,21 +32,13 @@ namespace io { using OptionDict = std::map; -// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260 -// Starting from libavformat 59 (ffmpeg 5), -// AVInputFormat is const and related functions expect constant. -#if LIBAVFORMAT_VERSION_MAJOR >= 59 -#define AVFORMAT_CONST const -#else -#define AVFORMAT_CONST -#endif - // Replacement of av_err2str, which causes // `error: taking address of temporary array` // https://github.com/joncampbell123/composite-video-simulator/issues/5 av_always_inline std::string av_err2string(int errnum) { char str[AV_ERROR_MAX_STRING_SIZE]; - return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); + detail::libav().av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE); + return str; } // Base structure that handles memory management. diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp index 797f0783494..ca4c92bb5c7 100644 --- a/torchaudio/csrc/ffmpeg/filter_graph.cpp +++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp @@ -1,12 +1,15 @@ #include +#include #include namespace torchaudio { namespace io { +using torchaudio::io::detail::libav; + namespace { AVFilterGraph* get_filter_graph() { - AVFilterGraph* ptr = avfilter_graph_alloc(); + AVFilterGraph* ptr = libav().avfilter_graph_alloc(); TORCH_CHECK(ptr, "Failed to allocate resouce."); ptr->nb_threads = 1; return ptr; @@ -32,7 +35,7 @@ std::string get_audio_src_args( time_base.num, time_base.den, sample_rate, - av_get_sample_fmt_name(format), + libav().av_get_sample_fmt_name(format), channel_layout); return std::string(args); } @@ -51,7 +54,7 @@ std::string get_video_src_args( "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d", width, height, - av_get_pix_fmt_name(format), + libav().av_get_pix_fmt_name(format), time_base.num, time_base.den, frame_rate.num, @@ -69,7 +72,7 @@ void FilterGraph::add_audio_src( int sample_rate, uint64_t channel_layout) { add_src( - avfilter_get_by_name("abuffer"), + libav().avfilter_get_by_name("abuffer"), get_audio_src_args(format, time_base, sample_rate, channel_layout)); } @@ -81,13 +84,13 @@ void FilterGraph::add_video_src( int height, AVRational sample_aspect_ratio) { add_src( - avfilter_get_by_name("buffer"), + libav().avfilter_get_by_name("buffer"), get_video_src_args( format, time_base, frame_rate, width, height, sample_aspect_ratio)); } void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { - int ret = avfilter_graph_create_filter( + int ret = libav().avfilter_graph_create_filter( &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph); TORCH_CHECK( ret >= 0, @@ -96,11 +99,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { } void FilterGraph::add_audio_sink() { - add_sink(avfilter_get_by_name("abuffersink")); + add_sink(libav().avfilter_get_by_name("abuffersink")); } void FilterGraph::add_video_sink() { - add_sink(avfilter_get_by_name("buffersink")); + add_sink(libav().avfilter_get_by_name("buffersink")); } void FilterGraph::add_sink(const AVFilter* buffersink) { @@ -114,7 +117,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) { // According to the other example // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html // `abuffersink` should not take options, and this resolved issue. - int ret = avfilter_graph_create_filter( + int ret = libav().avfilter_graph_create_filter( &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph); TORCH_CHECK(ret >= 0, "Failed to create output filter."); } @@ -131,15 +134,15 @@ class InOuts { public: InOuts(const char* name, AVFilterContext* pCtx) { - p = avfilter_inout_alloc(); + p = libav().avfilter_inout_alloc(); TORCH_CHECK(p, "Failed to allocate AVFilterInOut."); - p->name = av_strdup(name); + p->name = libav().av_strdup(name); p->filter_ctx = pCtx; p->pad_idx = 0; p->next = nullptr; } ~InOuts() { - avfilter_inout_free(&p); + libav().avfilter_inout_free(&p); } operator AVFilterInOut**() { return &p; @@ -156,7 +159,7 @@ void FilterGraph::add_process(const std::string& filter_description) { // If you are debugging this part of the code, you might get confused. InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx}; - int ret = avfilter_graph_parse_ptr( + int ret = libav().avfilter_graph_parse_ptr( graph, filter_description.c_str(), out, in, nullptr); TORCH_CHECK( @@ -167,11 +170,11 @@ void FilterGraph::add_process(const std::string& filter_description) { void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) { buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx; - int ret = avfilter_graph_config(graph, nullptr); + int ret = libav().avfilter_graph_config(graph, nullptr); TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret)); - // char* desc = avfilter_graph_dump(graph, NULL); + // char* desc = libav().avfilter_graph_dump(graph, NULL); // std::cerr << "Filter created:\n" << desc << std::endl; - // av_free(static_cast(desc)); + // libav().av_free(static_cast(desc)); } ////////////////////////////////////////////////////////////////////////////// @@ -191,7 +194,7 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { ret.num_channels = l->ch_layout.nb_channels; #else // Before FFmpeg 5.1 - ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout); + ret.num_channels = libav().av_get_channel_layout_nb_channels(l->channel_layout); #endif break; } @@ -214,12 +217,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { // Streaming process ////////////////////////////////////////////////////////////////////////////// int FilterGraph::add_frame(AVFrame* pInputFrame) { - return av_buffersrc_add_frame_flags( + return libav().av_buffersrc_add_frame_flags( buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF); } int FilterGraph::get_frame(AVFrame* pOutputFrame) { - return av_buffersink_get_frame(buffersink_ctx, pOutputFrame); + return libav().av_buffersink_get_frame(buffersink_ctx, pOutputFrame); } } // namespace io diff --git a/torchaudio/csrc/ffmpeg/hw_context.cpp b/torchaudio/csrc/ffmpeg/hw_context.cpp index a1d7f3c7a04..7341b8e0746 100644 --- a/torchaudio/csrc/ffmpeg/hw_context.cpp +++ b/torchaudio/csrc/ffmpeg/hw_context.cpp @@ -1,6 +1,10 @@ #include +#include namespace torchaudio::io { + +using detail::libav; + namespace { static std::mutex MUTEX; @@ -15,7 +19,7 @@ AVBufferRef* get_cuda_context(int index) { } if (CUDA_CONTEXT_CACHE.count(index) == 0) { AVBufferRef* p = nullptr; - int ret = av_hwdevice_ctx_create( + int ret = libav().av_hwdevice_ctx_create( &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); TORCH_CHECK( ret >= 0, diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp index 7ccc7bd0bf9..3470d3b3ff6 100644 --- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp +++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp @@ -1,10 +1,14 @@ #include #include +#include #include #include namespace torchaudio { namespace io { + +using detail::libav; + namespace { std::map> get_versions() { @@ -12,7 +16,7 @@ std::map> get_versions() { #define add_version(NAME) \ { \ - int ver = NAME##_version(); \ + int ver = libav().NAME##_version(); \ ret.emplace( \ "lib" #NAME, \ std::make_tuple<>( \ @@ -35,7 +39,7 @@ std::map get_demuxers(bool req_device) { std::map ret; const AVInputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = av_demuxer_iterate(&i))) { + while ((fmt = libav().av_demuxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -52,7 +56,7 @@ std::map get_muxers(bool req_device) { std::map ret; const AVOutputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = av_muxer_iterate(&i))) { + while ((fmt = libav().av_muxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -71,10 +75,10 @@ std::map get_codecs( const AVCodec* c = nullptr; void* i = nullptr; std::map ret; - while ((c = av_codec_iterate(&i))) { + while ((c = libav().av_codec_iterate(&i))) { assert(c); - if ((req_encoder && av_codec_is_encoder(c)) || - (!req_encoder && av_codec_is_decoder(c))) { + if ((req_encoder && libav().av_codec_is_encoder(c)) || + (!req_encoder && libav().av_codec_is_decoder(c))) { if (c->type == type && c->name) { ret.emplace(c->name, c->long_name ? c->long_name : ""); } @@ -87,7 +91,7 @@ std::vector get_protocols(bool output) { void* opaque = nullptr; const char* name = nullptr; std::vector ret; - while ((name = avio_enum_protocols(&opaque, output))) { + while ((name = libav().avio_enum_protocols(&opaque, output))) { assert(name); ret.emplace_back(name); } @@ -95,7 +99,7 @@ std::vector get_protocols(bool output) { } std::string get_build_config() { - return avcodec_configuration(); + return libav().avcodec_configuration(); } ////////////////////////////////////////////////////////////////////////////// @@ -188,9 +192,10 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO { }; PYBIND11_MODULE(_torchaudio_ffmpeg, m) { - m.def("init", []() { avdevice_register_all(); }); - m.def("get_log_level", []() { return av_log_get_level(); }); - m.def("set_log_level", [](int level) { av_log_set_level(level); }); + m.def("test_dlopen", []() { detail::libav(); }); + m.def("init", []() { libav().avdevice_register_all(); }); + m.def("get_log_level", []() { return libav().av_log_get_level(); }); + m.def("set_log_level", [](int level) { libav().av_log_set_level(level); }); m.def("get_versions", &get_versions); m.def("get_muxers", []() { return get_muxers(false); }); m.def("get_demuxers", []() { return get_demuxers(false); }); @@ -246,21 +251,21 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const OutputStreamInfo& o) -> std::string { - return av_get_media_type_string(o.media_type); + return libav().av_get_media_type_string(o.media_type); }) .def_property_readonly( "format", [](const OutputStreamInfo& o) -> std::string { switch (o.media_type) { case AVMEDIA_TYPE_AUDIO: - return av_get_sample_fmt_name((AVSampleFormat)(o.format)); + return libav().av_get_sample_fmt_name((AVSampleFormat)(o.format)); case AVMEDIA_TYPE_VIDEO: - return av_get_pix_fmt_name((AVPixelFormat)(o.format)); + return libav().av_get_pix_fmt_name((AVPixelFormat)(o.format)); default: TORCH_INTERNAL_ASSERT( false, "FilterGraph is returning unexpected media type: ", - av_get_media_type_string(o.media_type)); + libav().av_get_media_type_string(o.media_type)); } }) .def_readonly("sample_rate", &OutputStreamInfo::sample_rate) @@ -284,7 +289,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const SrcStreamInfo& s) { - return av_get_media_type_string(s.media_type); + return libav().av_get_media_type_string(s.media_type); }) .def_readonly("codec_name", &SrcStreamInfo::codec_name) .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name) diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp index 99e33e8367f..469cb315bd9 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp @@ -1,4 +1,5 @@ #include +#include #include #ifdef USE_CUDA @@ -7,6 +8,8 @@ namespace torchaudio::io { +using detail::libav; + //////////////////////////////////////////////////////////////////////////////// // Audio //////////////////////////////////////////////////////////////////////////////// @@ -429,11 +432,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_NV12 == sw_fmt, "Expected NV12 format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + libav().av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -506,11 +509,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_P010 == sw_fmt, "Expected P010 format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + libav().av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -581,11 +584,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_YUV444P == sw_fmt, "Expected YUV444P format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + libav().av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly for (int i = 0; i < num_channels; ++i) { diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp index 8caec7cb582..77944c67fb7 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp @@ -1,10 +1,14 @@ +#include #include namespace torchaudio { namespace io { + +using detail::libav; + void PacketBuffer::push_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); - AVPacket* p = av_packet_clone(packet); + AVPacket* p = libav().av_packet_clone(packet); TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); packets.emplace_back(p); } diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp index 147d0bc2d57..2cd83746889 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -5,6 +6,9 @@ namespace torchaudio::io { namespace detail { + +using detail::libav; + namespace { /////////////////////////////////////////////////////////////////////////////// @@ -48,7 +52,7 @@ FilterGraphFactory get_video_factory( f.add_video_sink(); f.add_process(filter_desc); if (hw_frames_ctx) { - f.create_filter(av_buffer_ref(hw_frames_ctx)); + f.create_filter(libav().av_buffer_ref(hw_frames_ctx)); } else { f.create_filter(); } @@ -139,7 +143,7 @@ struct ProcessImpl : public IPostDecodeProcess { if (ret >= 0) { buffer.push_frame(converter.convert(frame), frame->pts); } - av_frame_unref(frame); + libav().av_frame_unref(frame); } return ret; } @@ -159,7 +163,7 @@ std::unique_ptr get_unchunked_audio_process( TORCH_INTERNAL_ASSERT( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); using B = UnchunkedBuffer; @@ -226,7 +230,7 @@ std::unique_ptr get_unchunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt)); } } @@ -239,7 +243,7 @@ std::unique_ptr get_chunked_audio_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); using B = ChunkedBuffer; B buffer{i.time_base, frames_per_chunk, num_chunks}; @@ -307,7 +311,7 @@ std::unique_ptr get_chunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt)); } } @@ -321,7 +325,7 @@ std::unique_ptr get_unchunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -375,7 +379,7 @@ std::unique_ptr get_unchunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); + false, "Unexpected video format found: ", libav().av_get_pix_fmt_name(fmt)); } } } @@ -393,7 +397,7 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); using B = UnchunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -416,13 +420,13 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); } } #endif @@ -437,7 +441,7 @@ std::unique_ptr get_chunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -491,7 +495,7 @@ std::unique_ptr get_chunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); + false, "Unexpected video format found: ", libav().av_get_pix_fmt_name(fmt)); } } } @@ -511,7 +515,7 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + libav().av_get_media_type_string(i.type)); using B = ChunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -540,13 +544,13 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + libav().av_get_pix_fmt_name(fmt)); } } #endif diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp index a0bf22a0650..272975553d1 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -6,6 +7,8 @@ namespace torchaudio { namespace io { +using detail::libav; + namespace { AVCodecContextPtr alloc_codec_context( enum AVCodecID codec_id, @@ -13,24 +16,24 @@ AVCodecContextPtr alloc_codec_context( const AVCodec* codec = [&]() { if (decoder_name) { const AVCodec* c = - avcodec_find_decoder_by_name(decoder_name.value().c_str()); + libav().avcodec_find_decoder_by_name(decoder_name.value().c_str()); TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value()); return c; } else { - const AVCodec* c = avcodec_find_decoder(codec_id); - TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id)); + const AVCodec* c = libav().avcodec_find_decoder(codec_id); + TORCH_CHECK(c, "Unsupported codec: ", libav().avcodec_get_name(codec_id)); return c; } }(); - AVCodecContext* codec_ctx = avcodec_alloc_context3(codec); + AVCodecContext* codec_ctx = libav().avcodec_alloc_context3(codec); TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext."); return AVCodecContextPtr(codec_ctx); } const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) { for (int i = 0;; ++i) { - const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i); + const AVCodecHWConfig* config = libav().avcodec_get_hw_config(codec, i); if (!config) { break; } @@ -83,7 +86,7 @@ enum AVPixelFormat get_hw_format( } AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { - AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); + AVBufferRef* p = libav().av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); TORCH_CHECK( p, "Failed to allocate CUDA frame context from device context at ", @@ -94,11 +97,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { frames_ctx->width = codec_ctx->width; frames_ctx->height = codec_ctx->height; frames_ctx->initial_pool_size = 5; - int ret = av_hwframe_ctx_init(p); + int ret = libav().av_hwframe_ctx_init(p); if (ret >= 0) { return p; } - av_buffer_unref(&p); + libav().av_buffer_unref(&p); TORCH_CHECK( false, "Failed to initialize CUDA frame context: ", av_err2string(ret)); } @@ -107,7 +110,7 @@ void configure_codec_context( AVCodecContext* codec_ctx, const AVCodecParameters* params, const torch::Device& device) { - int ret = avcodec_parameters_to_context(codec_ctx, params); + int ret = libav().avcodec_parameters_to_context(codec_ctx, params); TORCH_CHECK( ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret)); @@ -122,7 +125,7 @@ void configure_codec_context( // 2. Set pCodecContext->get_format call back function which // will retrieve the HW pixel format from opaque pointer. codec_ctx->get_format = get_hw_format; - codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); + codec_ctx->hw_device_ctx = libav().av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( codec_ctx->hw_device_ctx, "Failed to reference HW device context."); #endif @@ -135,16 +138,16 @@ void open_codec( AVDictionary* opts = get_option_dict(decoder_option); // Default to single thread execution. - if (!av_dict_get(opts, "threads", nullptr, 0)) { - av_dict_set(&opts, "threads", "1", 0); + if (!libav().av_dict_get(opts, "threads", nullptr, 0)) { + libav().av_dict_set(&opts, "threads", "1", 0); } if (!codec_ctx->channel_layout) { codec_ctx->channel_layout = - av_get_default_channel_layout(codec_ctx->channels); + libav().av_get_default_channel_layout(codec_ctx->channels); } - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts); + int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opts); clean_up_dict(opts); TORCH_CHECK( ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret)); @@ -260,7 +263,7 @@ void StreamProcessor::remove_stream(KeyType key) { void StreamProcessor::set_discard_timestamp(int64_t timestamp) { TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative."); discard_before_pts = - av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base); + libav().av_rescale_q(timestamp, libav().av_get_time_base_q(), stream_time_base); } void StreamProcessor::set_decoder( @@ -306,9 +309,9 @@ int StreamProcessor::process_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - int ret = avcodec_send_packet(codec_ctx, packet); + int ret = libav().avcodec_send_packet(codec_ctx, packet); while (ret >= 0) { - ret = avcodec_receive_frame(codec_ctx, frame); + ret = libav().avcodec_receive_frame(codec_ctx, frame); // AVERROR(EAGAIN) means that new input data is required to return new // output. if (ret == AVERROR(EAGAIN)) @@ -355,7 +358,7 @@ int StreamProcessor::process_packet(AVPacket* packet) { } // else we can just unref the frame and continue - av_frame_unref(frame); + libav().av_frame_unref(frame); } return ret; } @@ -364,7 +367,7 @@ void StreamProcessor::flush() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - avcodec_flush_buffers(codec_ctx); + libav().avcodec_flush_buffers(codec_ctx); for (auto& ite : post_processes) { ite.second->flush(); } diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp index 0eec327aa51..5a2143e2730 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp @@ -1,13 +1,19 @@ #include +#include #include #include #include #include #include +extern "C" { +#include +} + namespace torchaudio { namespace io { +using detail::libav; using KeyType = StreamProcessor::KeyType; ////////////////////////////////////////////////////////////////////////////// @@ -19,7 +25,7 @@ AVFormatContext* get_input_format_context( const c10::optional& format, const c10::optional& option, AVIOContext* io_ctx) { - AVFormatContext* p = avformat_alloc_context(); + AVFormatContext* p = libav().avformat_alloc_context(); TORCH_CHECK(p, "Failed to allocate AVFormatContext."); if (io_ctx) { p->pb = io_ctx; @@ -29,7 +35,7 @@ AVFormatContext* get_input_format_context( if (format.has_value()) { std::string format_str = format.value(); AVFORMAT_CONST AVInputFormat* pInput = - av_find_input_format(format_str.c_str()); + libav().av_find_input_format(format_str.c_str()); TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\""); return pInput; } @@ -37,7 +43,7 @@ AVFormatContext* get_input_format_context( }(); AVDictionary* opt = get_option_dict(option); - int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt); + int ret = libav().avformat_open_input(&p, src.c_str(), pInputFormat, &opt); clean_up_dict(opt); TORCH_CHECK( @@ -53,7 +59,7 @@ AVFormatContext* get_input_format_context( StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) { C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader"); - int ret = avformat_find_stream_info(format_ctx, nullptr); + int ret = libav().avformat_find_stream_info(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to find stream information: ", av_err2string(ret)); @@ -110,7 +116,7 @@ void validate_src_stream_type( "Stream ", i, " is not ", - av_get_media_type_string(type), + libav().av_get_media_type_string(type), " stream."); } @@ -125,7 +131,7 @@ namespace { OptionDict parse_metadata(const AVDictionary* metadata) { AVDictionaryEntry* tag = nullptr; OptionDict ret; - while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { + while ((tag = libav().av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { ret.emplace(std::string(tag->key), std::string(tag->value)); } return ret; @@ -148,7 +154,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { ret.num_frames = stream->nb_frames; ret.bits_per_sample = codecpar->bits_per_raw_sample; ret.metadata = parse_metadata(stream->metadata); - const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); + const AVCodecDescriptor* desc = libav().avcodec_descriptor_get(codecpar->codec_id); if (desc) { ret.codec_name = desc->name; ret.codec_long_name = desc->long_name; @@ -158,7 +164,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_AUDIO: { AVSampleFormat smp_fmt = static_cast(codecpar->format); if (smp_fmt != AV_SAMPLE_FMT_NONE) { - ret.fmt_name = av_get_sample_fmt_name(smp_fmt); + ret.fmt_name = libav().av_get_sample_fmt_name(smp_fmt); } ret.sample_rate = static_cast(codecpar->sample_rate); ret.num_channels = codecpar->channels; @@ -167,7 +173,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_VIDEO: { AVPixelFormat pix_fmt = static_cast(codecpar->format); if (pix_fmt != AV_PIX_FMT_NONE) { - ret.fmt_name = av_get_pix_fmt_name(pix_fmt); + ret.fmt_name = libav().av_get_pix_fmt_name(pix_fmt); } ret.width = codecpar->width; ret.height = codecpar->height; @@ -181,7 +187,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { namespace { AVCodecParameters* get_codecpar() { - AVCodecParameters* ptr = avcodec_parameters_alloc(); + AVCodecParameters* ptr = libav().avcodec_parameters_alloc(); TORCH_CHECK(ptr, "Failed to allocate resource."); return ptr; } @@ -192,7 +198,7 @@ StreamParams StreamReader::get_src_stream_params(int i) { AVStream* stream = format_ctx->streams[i]; AVCodecParametersPtr codec_params(get_codecpar()); - int ret = avcodec_parameters_copy(codec_params, stream->codecpar); + int ret = libav().avcodec_parameters_copy(codec_params, stream->codecpar); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. (", @@ -234,12 +240,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const { } int64_t StreamReader::find_best_audio_stream() const { - return av_find_best_stream( + return libav().av_find_best_stream( format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0); } int64_t StreamReader::find_best_video_stream() const { - return av_find_best_stream( + return libav().av_find_best_stream( format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); } @@ -289,7 +295,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) { TORCH_CHECK(false, "Invalid mode value: ", mode); } - int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); + int ret = libav().av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); if (ret < 0) { seek_timestamp = 0; @@ -402,12 +408,12 @@ void StreamReader::add_stream( case AVMEDIA_TYPE_AUDIO: return AVRational{0, 1}; case AVMEDIA_TYPE_VIDEO: - return av_guess_frame_rate(format_ctx, stream, nullptr); + return libav().av_guess_frame_rate(format_ctx, stream, nullptr); default: TORCH_INTERNAL_ASSERT( false, "Unexpected media type is given: ", - av_get_media_type_string(media_type)); + libav().av_get_media_type_string(media_type)); } }(); int key = processors[i]->add_stream( @@ -446,7 +452,7 @@ void StreamReader::remove_stream(int64_t i) { // 1: It's done, caller should stop calling // <0: Some error happened int StreamReader::process_packet() { - int ret = av_read_frame(format_ctx, packet); + int ret = libav().av_read_frame(format_ctx, packet); if (ret == AVERROR_EOF) { ret = drain(); return (ret < 0) ? ret : 1; @@ -577,12 +583,12 @@ AVIOContext* get_io_context( int buffer_size, int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); + unsigned char* buffer = static_cast(libav().av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( + AVIOContext* io_ctx = libav().avio_alloc_context( buffer, buffer_size, 0, opaque, read_packet, nullptr, seek); if (!io_ctx) { - av_freep(&buffer); + libav().av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp index a0e18fb8d41..7269dfc4fc6 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp @@ -1,9 +1,16 @@ +#include #include #include #include +extern "C" { +#include +} + namespace torchaudio::io { +using detail::libav; + //////////////////////////////////////////////////////////////////////////////// // EncodeProcess Logic Implementation //////////////////////////////////////////////////////////////////////////////// @@ -56,7 +63,7 @@ void EncodeProcess::process_frame(AVFrame* src) { if (ret >= 0) { encoder.encode(dst_frame); } - av_frame_unref(dst_frame); + libav().av_frame_unref(dst_frame); } } @@ -71,8 +78,8 @@ void EncodeProcess::flush() { namespace { enum AVSampleFormat get_src_sample_fmt(const std::string& src) { - auto fmt = av_get_sample_fmt(src.c_str()); - if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) { + auto fmt = libav().av_get_sample_fmt(src.c_str()); + if (fmt != AV_SAMPLE_FMT_NONE && !libav().av_sample_fmt_is_planar(fmt)) { return fmt; } TORCH_CHECK( @@ -89,7 +96,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) { AV_SAMPLE_FMT_S64, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_DBL}) { - ret.emplace_back(av_get_sample_fmt_name(fmt)); + ret.emplace_back(libav().av_get_sample_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -97,7 +104,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) { } enum AVPixelFormat get_src_pix_fmt(const std::string& src) { - AVPixelFormat fmt = av_get_pix_fmt(src.c_str()); + AVPixelFormat fmt = libav().av_get_pix_fmt(src.c_str()); switch (fmt) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: @@ -118,7 +125,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) { AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_YUV444P}) { - ret.emplace_back(av_get_pix_fmt_name(fmt)); + ret.emplace_back(libav().av_get_pix_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -132,18 +139,18 @@ const AVCodec* get_codec( AVCodecID default_codec, const c10::optional& encoder) { if (encoder) { - const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str()); + const AVCodec* c = libav().avcodec_find_encoder_by_name(encoder.value().c_str()); TORCH_CHECK(c, "Unexpected codec: ", encoder.value()); return c; } - const AVCodec* c = avcodec_find_encoder(default_codec); + const AVCodec* c = libav().avcodec_find_encoder(default_codec); TORCH_CHECK( - c, "Encoder not found for codec: ", avcodec_get_name(default_codec)); + c, "Encoder not found for codec: ", libav().avcodec_get_name(default_codec)); return c; } AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) { - AVCodecContext* ctx = avcodec_alloc_context3(codec); + AVCodecContext* ctx = libav().avcodec_alloc_context3(codec); TORCH_CHECK(ctx, "Failed to allocate CodecContext."); if (flags & AVFMT_GLOBALHEADER) { @@ -169,25 +176,25 @@ void open_codec( // while "libopus" refers to the one depends on libopusenc // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251 if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { + if (!libav().av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); + libav().av_dict_set(&opt, "strict", "experimental", 0); } } if (std::strcmp(codec_ctx->codec->name, "opus") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { + if (!libav().av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"opus\" encoder is selected. Enabling '-strict experimental'. ", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); + libav().av_dict_set(&opt, "strict", "experimental", 0); } } - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt); + int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opt); clean_up_dict(opt); TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")"); } @@ -214,7 +221,7 @@ bool supported_sample_fmt( std::string get_supported_formats(const AVSampleFormat* sample_fmts) { std::vector ret; while (*sample_fmts != AV_SAMPLE_FMT_NONE) { - ret.emplace_back(av_get_sample_fmt_name(*sample_fmts)); + ret.emplace_back(libav().av_get_sample_fmt_name(*sample_fmts)); ++sample_fmts; } return c10::Join(", ", ret); @@ -226,7 +233,7 @@ AVSampleFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { auto& enc_fmt_val = encoder_format.value(); - auto fmt = av_get_sample_fmt(enc_fmt_val.c_str()); + auto fmt = libav().av_get_sample_fmt(enc_fmt_val.c_str()); TORCH_CHECK( fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val); TORCH_CHECK( @@ -313,8 +320,8 @@ std::string get_supported_channels(const uint64_t* channel_layouts) { std::vector names; while (*channel_layouts) { std::stringstream ss; - ss << av_get_channel_layout_nb_channels(*channel_layouts); - ss << " (" << av_get_channel_name(*channel_layouts) << ")"; + ss << libav().av_get_channel_layout_nb_channels(*channel_layouts); + ss << " (" << libav().av_get_channel_name(*channel_layouts) << ")"; names.emplace_back(ss.str()); ++channel_layouts; } @@ -331,10 +338,10 @@ uint64_t get_channel_layout( TORCH_CHECK( val > 0, "The number of channels must be greater than 0. Found: ", val); if (!codec->channel_layouts) { - return static_cast(av_get_default_channel_layout(val)); + return static_cast(libav().av_get_default_channel_layout(val)); } for (const uint64_t* it = codec->channel_layouts; *it; ++it) { - if (av_get_channel_layout_nb_channels(*it) == val) { + if (libav().av_get_channel_layout_nb_channels(*it) == val) { return *it; } } @@ -371,8 +378,8 @@ void configure_audio_codec_ctx( const c10::optional& codec_config) { codec_ctx->sample_fmt = format; codec_ctx->sample_rate = sample_rate; - codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24)); - codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout); + codec_ctx->time_base = av_inv_q(libav().av_d2q(sample_rate, 1 << 24)); + codec_ctx->channels = libav().av_get_channel_layout_nb_channels(channel_layout); codec_ctx->channel_layout = channel_layout; // Set optional stuff @@ -411,7 +418,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) { std::string get_supported_formats(const AVPixelFormat* pix_fmts) { std::vector ret; while (*pix_fmts != AV_PIX_FMT_NONE) { - ret.emplace_back(av_get_pix_fmt_name(*pix_fmts)); + ret.emplace_back(libav().av_get_pix_fmt_name(*pix_fmts)); ++pix_fmts; } return c10::Join(", ", ret); @@ -423,7 +430,7 @@ AVPixelFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { const auto& val = encoder_format.value(); - auto fmt = av_get_pix_fmt(val.c_str()); + auto fmt = libav().av_get_pix_fmt(val.c_str()); TORCH_CHECK( supported_pix_fmt(fmt, codec->pix_fmts), codec->name, @@ -461,7 +468,7 @@ AVRational get_enc_rate( std::isfinite(enc_rate) && enc_rate > 0, "Encoder sample rate must be positive and fininte. Found: ", enc_rate); - AVRational rate = av_d2q(enc_rate, 1 << 24); + AVRational rate = libav().av_d2q(enc_rate, 1 << 24); TORCH_CHECK( supported_frame_rate(rate, codec->supported_framerates), codec->name, @@ -545,14 +552,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { // context to AVCodecContext. But this way, it will be deallocated // automatically at the time AVCodecContext is freed, so we do that. - ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); + ctx->hw_device_ctx = libav().av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( ctx->hw_device_ctx, "Failed to reference HW device context."); ctx->sw_pix_fmt = ctx->pix_fmt; ctx->pix_fmt = AV_PIX_FMT_CUDA; - ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx); + ctx->hw_frames_ctx = libav().av_hwframe_ctx_alloc(ctx->hw_device_ctx); TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context."); auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data); @@ -562,7 +569,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { frames_ctx->height = ctx->height; frames_ctx->initial_pool_size = 5; - int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx); + int ret = libav().av_hwframe_ctx_init(ctx->hw_frames_ctx); TORCH_CHECK( ret >= 0, "Failed to initialize CUDA frame context: ", @@ -574,11 +581,11 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { //////////////////////////////////////////////////////////////////////////////// AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); + AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr); TORCH_CHECK(stream, "Failed to allocate stream."); stream->time_base = codec_ctx->time_base; - int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx); + int ret = libav().avcodec_parameters_from_context(stream->codecpar, codec_ctx); TORCH_CHECK( ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret)); return stream; @@ -605,7 +612,7 @@ FilterGraph get_audio_filter_graph( if (filter_desc || src_fmt != enc_fmt || src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) { std::stringstream ss; - ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt) + ss << "aformat=sample_fmts=" << libav().av_get_sample_fmt_name(enc_fmt) << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x" << std::hex << enc_ch_layout; parts.push_back(ss.str()); @@ -656,7 +663,7 @@ FilterGraph get_video_filter_graph( } if (filter_desc || src_fmt != enc_fmt) { std::stringstream ss; - ss << "format=" << av_get_pix_fmt_name(enc_fmt); + ss << "format=" << libav().av_get_pix_fmt_name(enc_fmt); parts.emplace_back(ss.str()); } if (filter_desc || @@ -695,7 +702,7 @@ AVFramePtr get_audio_frame( frame->channel_layout = channel_layout; frame->sample_rate = sample_rate; frame->nb_samples = nb_samples; - int ret = av_frame_get_buffer(frame, 0); + int ret = libav().av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating the source audio frame:", av_err2string(ret)); @@ -711,7 +718,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) { frame->format = src_fmt; frame->width = width; frame->height = height; - int ret = av_frame_get_buffer(frame, 0); + int ret = libav().av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating a video buffer :", av_err2string(ret)); @@ -756,10 +763,10 @@ EncodeProcess get_audio_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. const AVSampleFormat src_fmt = (disable_converter) - ? av_get_sample_fmt(format.c_str()) + ? libav().av_get_sample_fmt(format.c_str()) : get_src_sample_fmt(format); const auto src_ch_layout = - static_cast(av_get_default_channel_layout(src_num_channels)); + static_cast(libav().av_get_default_channel_layout(src_num_channels)); // 2. Fetch codec from default or override TORCH_CHECK( @@ -779,7 +786,7 @@ EncodeProcess get_audio_encode_process( // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277 // This is the case for at least until FFmpeg 6.0, so it will be // like this for a while. - return static_cast(av_get_default_channel_layout(2)); + return static_cast(libav().av_get_default_channel_layout(2)); } return get_channel_layout(src_ch_layout, encoder_num_channels, codec); }(); @@ -867,9 +874,9 @@ EncodeProcess get_video_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. const AVPixelFormat src_fmt = (disable_converter) - ? av_get_pix_fmt(format.c_str()) + ? libav().av_get_pix_fmt(format.c_str()) : get_src_pix_fmt(format); - const AVRational src_rate = av_d2q(frame_rate, 1 << 24); + const AVRational src_rate = libav().av_d2q(frame_rate, 1 << 24); // 2. Fetch codec from default or override TORCH_CHECK( @@ -936,7 +943,7 @@ EncodeProcess get_video_encode_process( AVFramePtr src_frame = [&]() { if (codec_ctx->hw_frames_ctx) { AVFramePtr frame{alloc_avframe()}; - int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); + int ret = libav().av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret)); frame->nb_samples = 1; frame->pts = 0; diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp index 3d2e5015357..038ae47ce36 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp @@ -1,7 +1,10 @@ +#include #include namespace torchaudio::io { +using detail::libav; + Encoder::Encoder( AVFormatContext* format_ctx, AVCodecContext* codec_ctx, @@ -13,10 +16,10 @@ Encoder::Encoder( /// /// @param frame Frame data to encode void Encoder::encode(AVFrame* frame) { - int ret = avcodec_send_frame(codec_ctx, frame); + int ret = libav().avcodec_send_frame(codec_ctx, frame); TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ")."); while (ret >= 0) { - ret = avcodec_receive_packet(codec_ctx, packet); + ret = libav().avcodec_receive_packet(codec_ctx, packet); if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { if (ret == AVERROR_EOF) { // Note: @@ -31,7 +34,7 @@ void Encoder::encode(AVFrame* frame) { // An alternative is to use `av_write_frame` functoin, but in that case // client code is responsible for ordering packets, which makes it // complicated to use StreamWriter - ret = av_interleaved_write_frame(format_ctx, nullptr); + ret = libav().av_interleaved_write_frame(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to flush packet (", av_err2string(ret), ")."); } @@ -51,10 +54,10 @@ void Encoder::encode(AVFrame* frame) { // This has to be set before av_packet_rescale_ts bellow. packet->duration = 1; } - av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base); + libav().av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base); packet->stream_index = stream->index; - ret = av_interleaved_write_frame(format_ctx, packet); + ret = libav().av_interleaved_write_frame(format_ctx, packet); TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ")."); } } diff --git a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp index 0701c5a5965..773081987a8 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp @@ -1,13 +1,17 @@ +#include #include namespace torchaudio::io { + +using detail::libav; + namespace { AVStream* add_stream( AVFormatContext* format_ctx, const StreamParams& stream_params) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); + AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr); int ret = - avcodec_parameters_copy(stream->codecpar, stream_params.codec_params); + libav().avcodec_parameters_copy(stream->codecpar, stream_params.codec_params); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. (", @@ -26,11 +30,11 @@ PacketWriter::PacketWriter( void PacketWriter::write_packet(const AVPacketPtr& packet) { AVPacket dst_packet; - int ret = av_packet_ref(&dst_packet, packet); + int ret = libav().av_packet_ref(&dst_packet, packet); TORCH_CHECK(ret >= 0, "Failed to copy packet."); - av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); + libav().av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); dst_packet.stream_index = stream->index; - ret = av_interleaved_write_frame(format_ctx, &dst_packet); + ret = libav().av_interleaved_write_frame(format_ctx, &dst_packet); TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); } } // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp index df51d92355c..7aacd013994 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp @@ -1,11 +1,14 @@ +#include #include #ifdef USE_CUDA #include #endif -namespace torchaudio { -namespace io { +namespace torchaudio::io { + +using detail::libav; + namespace { AVFormatContext* get_output_format_context( @@ -19,7 +22,7 @@ AVFormatContext* get_output_format_context( } AVFormatContext* p = nullptr; - int ret = avformat_alloc_output_context2( + int ret = libav().avformat_alloc_output_context2( &p, nullptr, format ? format.value().c_str() : nullptr, dst.c_str()); TORCH_CHECK( ret >= 0, @@ -208,14 +211,14 @@ void StreamWriter::add_video_frame_stream( } void StreamWriter::set_metadata(const OptionDict& metadata) { - av_dict_free(&format_ctx->metadata); + libav().av_dict_free(&format_ctx->metadata); for (auto const& [key, value] : metadata) { - av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); + libav().av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); } } void StreamWriter::dump_format(int64_t i) { - av_dump_format(format_ctx, (int)i, format_ctx->url, 1); + libav().av_dump_format(format_ctx, (int)i, format_ctx->url, 1); } void StreamWriter::open(const c10::optional& option) { @@ -231,10 +234,10 @@ void StreamWriter::open(const c10::optional& option) { AVDictionary* opt = get_option_dict(option); if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { - ret = avio_open2( + ret = libav().avio_open2( &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt); if (ret < 0) { - av_dict_free(&opt); + libav().av_dict_free(&opt); TORCH_CHECK( false, "Failed to open dst: ", @@ -245,7 +248,7 @@ void StreamWriter::open(const c10::optional& option) { } } - ret = avformat_write_header(format_ctx, &opt); + ret = libav().avformat_write_header(format_ctx, &opt); clean_up_dict(opt); TORCH_CHECK( ret >= 0, @@ -258,7 +261,7 @@ void StreamWriter::open(const c10::optional& option) { } void StreamWriter::close() { - int ret = av_write_trailer(format_ctx); + int ret = libav().av_write_trailer(format_ctx); if (ret < 0) { LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ")."; } @@ -269,7 +272,7 @@ void StreamWriter::close() { if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { // avio_closep can be only applied to AVIOContext opened by avio_open - avio_closep(&(format_ctx->pb)); + libav().avio_closep(&(format_ctx->pb)); } is_open = false; } @@ -355,12 +358,12 @@ AVIOContext* get_io_context( int buffer_size, int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); + unsigned char* buffer = static_cast(libav().av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( + AVIOContext* io_ctx = libav().avio_alloc_context( buffer, buffer_size, 1, opaque, nullptr, write_packet, seek); if (!io_ctx) { - av_freep(&buffer); + libav().av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; @@ -384,5 +387,4 @@ StreamWriterCustomIO::StreamWriterCustomIO( : CustomOutput(opaque, buffer_size, write_packet, seek), StreamWriter(io_ctx, format) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp index e9350f0479a..00ae55a6b77 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp @@ -1,3 +1,4 @@ +#include #include #ifdef USE_CUDA @@ -6,6 +7,8 @@ namespace torchaudio::io { +using detail::libav; + namespace { using InitFunc = TensorConverter::InitFunc; @@ -41,8 +44,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!libav().av_frame_is_writable(buffer)) { + int ret = libav().av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -145,8 +148,8 @@ void write_interlaced_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!libav().av_frame_is_writable(buffer)) { + int ret = libav().av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -187,7 +190,7 @@ void write_planar_video( AVFrame* buffer, int num_planes) { const auto num_colors = - av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; + libav().av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors); @@ -195,8 +198,8 @@ void write_planar_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!libav().av_frame_is_writable(buffer)) { + int ret = libav().av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -308,7 +311,7 @@ std::pair get_video_func(AVFrame* buffer) { TORCH_CHECK( false, "Unexpected pixel format for CUDA: ", - av_get_pix_fmt_name(sw_pix_fmt)); + libav().av_get_pix_fmt_name(sw_pix_fmt)); } } @@ -317,7 +320,7 @@ std::pair get_video_func(AVFrame* buffer) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: case AV_PIX_FMT_BGR24: { - int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components; + int channels = libav().av_pix_fmt_desc_get(pix_fmt)->nb_components; InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) { validate_video_input(t, f, channels); return init_interlaced(t); @@ -339,7 +342,7 @@ std::pair get_video_func(AVFrame* buffer) { } default: TORCH_CHECK( - false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt)); + false, "Unexpected pixel format: ", libav().av_get_pix_fmt_name(pix_fmt)); } } @@ -383,7 +386,7 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size) break; default: TORCH_INTERNAL_ASSERT( - false, "Unsupported media type: ", av_get_media_type_string(type)); + false, "Unsupported media type: ", libav().av_get_media_type_string(type)); } }