From bf6b01b0d53b5f9f0213abe52681ebfcbbd0211f Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:39:04 -0700 Subject: [PATCH] [audio][PR] Add option to dlopen FFmpeg libraries (#3402) Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/3402 This is a second attempt at https://github.com/pytorch/audio/pull/3353. The basic logic for enabling dlopen of the FFmpeg libraries is the same: it uses `at::DynamicLibrary`, which allows torchaudio to be compiled without linking against the FFmpeg libraries. This time, a DLOPEN_FFMPEG option has been added to control this feature, so that users can disable it and keep using build-time linking. Please refer to stub.h for more technical detail. Differential Revision: D46403783 fbshipit-source-id: 28758ed3b40e4de418c9f7014bd29e35cefbe3ec --- tools/setup_helpers/extension.py | 2 + torchaudio/csrc/ffmpeg/CMakeLists.txt | 18 +- torchaudio/csrc/ffmpeg/ffmpeg.cpp | 47 +-- torchaudio/csrc/ffmpeg/ffmpeg.h | 5 +- torchaudio/csrc/ffmpeg/filter_graph.cpp | 46 +-- torchaudio/csrc/ffmpeg/hw_context.cpp | 3 +- torchaudio/csrc/ffmpeg/pybind/pybind.cpp | 56 ++-- .../csrc/ffmpeg/stream_reader/conversion.cpp | 13 +- .../ffmpeg/stream_reader/packet_buffer.cpp | 10 +- .../ffmpeg/stream_reader/post_process.cpp | 37 ++- .../ffmpeg/stream_reader/stream_processor.cpp | 49 ++- .../ffmpeg/stream_reader/stream_reader.cpp | 53 +-- .../ffmpeg/stream_writer/encode_process.cpp | 94 +++--- .../csrc/ffmpeg/stream_writer/encoder.cpp | 12 +- .../ffmpeg/stream_writer/packet_writer.cpp | 14 +- .../ffmpeg/stream_writer/stream_writer.cpp | 32 +- .../ffmpeg/stream_writer/tensor_converter.cpp | 28 +- torchaudio/csrc/ffmpeg/stub.cpp | 196 +++++++++++ torchaudio/csrc/ffmpeg/stub.h | 313 ++++++++++++++++++ 19 files changed, 786 insertions(+), 242 deletions(-) create mode 100644 torchaudio/csrc/ffmpeg/stub.cpp create mode 100644 torchaudio/csrc/ffmpeg/stub.h diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 81e36d38041..c72415f0d86 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -38,6 +38,7 @@ def _get_build(var, default=False): _BUILD_RIR = _get_build("BUILD_RIR", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True) _USE_FFMPEG = _get_build("USE_FFMPEG", False) +_DLOPEN_FFMPEG = _get_build("DLOPEN_FFMPEG", True) _USE_ROCM = _get_build("USE_ROCM", torch.backends.cuda.is_built() and torch.version.hip is not None) _USE_CUDA = _get_build("USE_CUDA", torch.backends.cuda.is_built() and torch.version.hip is None) _BUILD_ALIGN = _get_build("BUILD_ALIGN", True) @@ -126,6 +127,7 @@ def build_extension(self, ext): f"-DUSE_CUDA:BOOL={'ON' if _USE_CUDA else 'OFF'}", f"-DUSE_OPENMP:BOOL={'ON' if _USE_OPENMP else 'OFF'}", f"-DUSE_FFMPEG:BOOL={'ON' if _USE_FFMPEG else 'OFF'}", + f"-DDLOPEN_FFMPEG:BOOL={'ON' if _DLOPEN_FFMPEG else 'OFF'}", ] build_args = ["--target", "install"] # Pass CUDA architecture to cmake diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt index e3445265b5c..849d83d62fb 100644 --- a/torchaudio/csrc/ffmpeg/CMakeLists.txt +++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt @@ -2,11 +2,13 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}") find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil) add_library(ffmpeg INTERFACE) target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}") +if (NOT DLOPEN_FFMPEG) target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}") - +endif() set( 
sources + stub.cpp ffmpeg.cpp filter_graph.cpp hw_context.cpp @@ -31,24 +33,24 @@ if (USE_CUDA) cuda_deps) endif() +if (DLOPEN_FFMPEG) + set(compile_definitions DLOPEN_FFMPEG) +endif() + torchaudio_library( libtorchaudio_ffmpeg "${sources}" "" "torch;ffmpeg;${additional_lib}" - "" + "${compile_definitions}" ) if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) - set( - ext_sources - pybind/pybind.cpp - ) torchaudio_extension( _torchaudio_ffmpeg - "${ext_sources}" + pybind/pybind.cpp "" "libtorchaudio_ffmpeg" - "" + "${compile_definitions}" ) endif () diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp index 66bd222c050..55e6c142b93 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp +++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp @@ -1,12 +1,18 @@ #include #include +#include #include #include #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { + +std::string av_err2string(int errnum) { + char str[AV_ERROR_MAX_STRING_SIZE]; + FFMPEG av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE); + return str; +} //////////////////////////////////////////////////////////////////////////////// // AVDictionary @@ -15,7 +21,7 @@ AVDictionary* get_option_dict(const c10::optional& option) { AVDictionary* opt = nullptr; if (option) { for (auto const& [key, value] : option.value()) { - av_dict_set(&opt, key.c_str(), value.c_str(), 0); + FFMPEG av_dict_set(&opt, key.c_str(), value.c_str(), 0); } } return opt; @@ -26,10 +32,10 @@ void clean_up_dict(AVDictionary* p) { std::vector unused_keys; // Check and copy unused keys, clean up the original dictionary AVDictionaryEntry* t = nullptr; - while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { + while ((t = FFMPEG av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { unused_keys.emplace_back(t->key); } - av_dict_free(&p); + FFMPEG av_dict_free(&p); TORCH_CHECK( unused_keys.empty(), "Unexpected options: ", @@ -41,14 +47,14 @@ void clean_up_dict(AVDictionary* p) { // AVFormatContext //////////////////////////////////////////////////////////////////////////////// void AVFormatInputContextDeleter::operator()(AVFormatContext* p) { - avformat_close_input(&p); + FFMPEG avformat_close_input(&p); }; AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p) : Wrapper(p) {} void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) { - avformat_free_context(p); + FFMPEG avformat_free_context(p); }; AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) @@ -58,9 +64,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) // AVIO //////////////////////////////////////////////////////////////////////////////// void AVIOContextDeleter::operator()(AVIOContext* p) { - avio_flush(p); - av_freep(&p->buffer); - av_freep(&p); + FFMPEG avio_flush(p); + FFMPEG av_freep(&p->buffer); + FFMPEG av_freep(&p); }; AVIOContextPtr::AVIOContextPtr(AVIOContext* p) @@ -70,13 +76,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p) // AVPacket //////////////////////////////////////////////////////////////////////////////// void AVPacketDeleter::operator()(AVPacket* p) { - av_packet_free(&p); + FFMPEG av_packet_free(&p); }; AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper(p) {} AVPacketPtr alloc_avpacket() { - AVPacket* p = av_packet_alloc(); + AVPacket* p = FFMPEG av_packet_alloc(); TORCH_CHECK(p, "Failed to allocate AVPacket object."); return AVPacketPtr{p}; } @@ -86,7 +92,7 @@ AVPacketPtr alloc_avpacket() { //////////////////////////////////////////////////////////////////////////////// 
AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){}; AutoPacketUnref::~AutoPacketUnref() { - av_packet_unref(p_); + FFMPEG av_packet_unref(p_); } AutoPacketUnref::operator AVPacket*() const { return p_; @@ -96,13 +102,13 @@ AutoPacketUnref::operator AVPacket*() const { // AVFrame //////////////////////////////////////////////////////////////////////////////// void AVFrameDeleter::operator()(AVFrame* p) { - av_frame_free(&p); + FFMPEG av_frame_free(&p); }; AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper(p) {} AVFramePtr alloc_avframe() { - AVFrame* p = av_frame_alloc(); + AVFrame* p = FFMPEG av_frame_alloc(); TORCH_CHECK(p, "Failed to allocate AVFrame object."); return AVFramePtr{p}; }; @@ -111,7 +117,7 @@ AVFramePtr alloc_avframe() { // AVCodecContext //////////////////////////////////////////////////////////////////////////////// void AVCodecContextDeleter::operator()(AVCodecContext* p) { - avcodec_free_context(&p); + FFMPEG avcodec_free_context(&p); }; AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) @@ -121,7 +127,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) // AVBufferRefPtr //////////////////////////////////////////////////////////////////////////////// void AutoBufferUnref::operator()(AVBufferRef* p) { - av_buffer_unref(&p); + FFMPEG av_buffer_unref(&p); } AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) @@ -131,7 +137,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) // AVFilterGraph //////////////////////////////////////////////////////////////////////////////// void AVFilterGraphDeleter::operator()(AVFilterGraph* p) { - avfilter_graph_free(&p); + FFMPEG avfilter_graph_free(&p); }; AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) @@ -141,11 +147,10 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) // AVCodecParameters //////////////////////////////////////////////////////////////////////////////// void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) { - avcodec_parameters_free(&codecpar); + FFMPEG avcodec_parameters_free(&codecpar); } AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p) : Wrapper(p) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h index 0bae00c12d8..83d18464fa3 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.h +++ b/torchaudio/csrc/ffmpeg/ffmpeg.h @@ -41,10 +41,7 @@ using OptionDict = std::map; // Replacement of av_err2str, which causes // `error: taking address of temporary array` // https://github.com/joncampbell123/composite-video-simulator/issues/5 -av_always_inline std::string av_err2string(int errnum) { - char str[AV_ERROR_MAX_STRING_SIZE]; - return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); -} +std::string av_err2string(int errnum); // Base structure that handles memory management. 
// Resource is freed by the destructor of unique_ptr, diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp index 797f0783494..faa3606e084 100644 --- a/torchaudio/csrc/ffmpeg/filter_graph.cpp +++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp @@ -1,12 +1,12 @@ #include +#include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { AVFilterGraph* get_filter_graph() { - AVFilterGraph* ptr = avfilter_graph_alloc(); + AVFilterGraph* ptr = FFMPEG avfilter_graph_alloc(); TORCH_CHECK(ptr, "Failed to allocate resouce."); ptr->nb_threads = 1; return ptr; @@ -32,7 +32,7 @@ std::string get_audio_src_args( time_base.num, time_base.den, sample_rate, - av_get_sample_fmt_name(format), + FFMPEG av_get_sample_fmt_name(format), channel_layout); return std::string(args); } @@ -51,7 +51,7 @@ std::string get_video_src_args( "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d", width, height, - av_get_pix_fmt_name(format), + FFMPEG av_get_pix_fmt_name(format), time_base.num, time_base.den, frame_rate.num, @@ -69,7 +69,7 @@ void FilterGraph::add_audio_src( int sample_rate, uint64_t channel_layout) { add_src( - avfilter_get_by_name("abuffer"), + FFMPEG avfilter_get_by_name("abuffer"), get_audio_src_args(format, time_base, sample_rate, channel_layout)); } @@ -81,13 +81,13 @@ void FilterGraph::add_video_src( int height, AVRational sample_aspect_ratio) { add_src( - avfilter_get_by_name("buffer"), + FFMPEG avfilter_get_by_name("buffer"), get_video_src_args( format, time_base, frame_rate, width, height, sample_aspect_ratio)); } void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { - int ret = avfilter_graph_create_filter( + int ret = FFMPEG avfilter_graph_create_filter( &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph); TORCH_CHECK( ret >= 0, @@ -96,11 +96,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { } void FilterGraph::add_audio_sink() { - add_sink(avfilter_get_by_name("abuffersink")); + add_sink(FFMPEG avfilter_get_by_name("abuffersink")); } void FilterGraph::add_video_sink() { - add_sink(avfilter_get_by_name("buffersink")); + add_sink(FFMPEG avfilter_get_by_name("buffersink")); } void FilterGraph::add_sink(const AVFilter* buffersink) { @@ -114,7 +114,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) { // According to the other example // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html // `abuffersink` should not take options, and this resolved issue. - int ret = avfilter_graph_create_filter( + int ret = FFMPEG avfilter_graph_create_filter( &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph); TORCH_CHECK(ret >= 0, "Failed to create output filter."); } @@ -131,15 +131,15 @@ class InOuts { public: InOuts(const char* name, AVFilterContext* pCtx) { - p = avfilter_inout_alloc(); + p = FFMPEG avfilter_inout_alloc(); TORCH_CHECK(p, "Failed to allocate AVFilterInOut."); - p->name = av_strdup(name); + p->name = FFMPEG av_strdup(name); p->filter_ctx = pCtx; p->pad_idx = 0; p->next = nullptr; } ~InOuts() { - avfilter_inout_free(&p); + FFMPEG avfilter_inout_free(&p); } operator AVFilterInOut**() { return &p; @@ -156,7 +156,7 @@ void FilterGraph::add_process(const std::string& filter_description) { // If you are debugging this part of the code, you might get confused. 
InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx}; - int ret = avfilter_graph_parse_ptr( + int ret = FFMPEG avfilter_graph_parse_ptr( graph, filter_description.c_str(), out, in, nullptr); TORCH_CHECK( @@ -167,11 +167,11 @@ void FilterGraph::add_process(const std::string& filter_description) { void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) { buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx; - int ret = avfilter_graph_config(graph, nullptr); + int ret = FFMPEG avfilter_graph_config(graph, nullptr); TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret)); - // char* desc = avfilter_graph_dump(graph, NULL); + // char* desc = FFMPEG avfilter_graph_dump(graph, NULL); // std::cerr << "Filter created:\n" << desc << std::endl; - // av_free(static_cast(desc)); + // FFMPEG av_free(static_cast(desc)); } ////////////////////////////////////////////////////////////////////////////// @@ -191,7 +191,8 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { ret.num_channels = l->ch_layout.nb_channels; #else // Before FFmpeg 5.1 - ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout); + ret.num_channels = + FFMPEG av_get_channel_layout_nb_channels(l->channel_layout); #endif break; } @@ -214,13 +215,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { // Streaming process ////////////////////////////////////////////////////////////////////////////// int FilterGraph::add_frame(AVFrame* pInputFrame) { - return av_buffersrc_add_frame_flags( + return FFMPEG av_buffersrc_add_frame_flags( buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF); } int FilterGraph::get_frame(AVFrame* pOutputFrame) { - return av_buffersink_get_frame(buffersink_ctx, pOutputFrame); + return FFMPEG av_buffersink_get_frame(buffersink_ctx, pOutputFrame); } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/hw_context.cpp b/torchaudio/csrc/ffmpeg/hw_context.cpp index a1d7f3c7a04..5c84f3dd09d 100644 --- a/torchaudio/csrc/ffmpeg/hw_context.cpp +++ b/torchaudio/csrc/ffmpeg/hw_context.cpp @@ -1,4 +1,5 @@ #include +#include namespace torchaudio::io { namespace { @@ -15,7 +16,7 @@ AVBufferRef* get_cuda_context(int index) { } if (CUDA_CONTEXT_CACHE.count(index) == 0) { AVBufferRef* p = nullptr; - int ret = av_hwdevice_ctx_create( + int ret = FFMPEG av_hwdevice_ctx_create( &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); TORCH_CHECK( ret >= 0, diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp index 7ccc7bd0bf9..fe1d2d2e809 100644 --- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp +++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp @@ -1,24 +1,24 @@ #include #include +#include #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { std::map> get_versions() { std::map> ret; -#define add_version(NAME) \ - { \ - int ver = NAME##_version(); \ - ret.emplace( \ - "lib" #NAME, \ - std::make_tuple<>( \ - AV_VERSION_MAJOR(ver), \ - AV_VERSION_MINOR(ver), \ - AV_VERSION_MICRO(ver))); \ +#define add_version(NAME) \ + { \ + int ver = FFMPEG NAME##_version(); \ + ret.emplace( \ + "lib" #NAME, \ + std::make_tuple<>( \ + AV_VERSION_MAJOR(ver), \ + AV_VERSION_MINOR(ver), \ + AV_VERSION_MICRO(ver))); \ } add_version(avutil); @@ -35,7 +35,7 @@ std::map get_demuxers(bool req_device) { std::map ret; const AVInputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = av_demuxer_iterate(&i))) { + while ((fmt = 
FFMPEG av_demuxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -52,7 +52,7 @@ std::map get_muxers(bool req_device) { std::map ret; const AVOutputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = av_muxer_iterate(&i))) { + while ((fmt = FFMPEG av_muxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -71,10 +71,10 @@ std::map get_codecs( const AVCodec* c = nullptr; void* i = nullptr; std::map ret; - while ((c = av_codec_iterate(&i))) { + while ((c = FFMPEG av_codec_iterate(&i))) { assert(c); - if ((req_encoder && av_codec_is_encoder(c)) || - (!req_encoder && av_codec_is_decoder(c))) { + if ((req_encoder && FFMPEG av_codec_is_encoder(c)) || + (!req_encoder && FFMPEG av_codec_is_decoder(c))) { if (c->type == type && c->name) { ret.emplace(c->name, c->long_name ? c->long_name : ""); } @@ -87,7 +87,7 @@ std::vector get_protocols(bool output) { void* opaque = nullptr; const char* name = nullptr; std::vector ret; - while ((name = avio_enum_protocols(&opaque, output))) { + while ((name = FFMPEG avio_enum_protocols(&opaque, output))) { assert(name); ret.emplace_back(name); } @@ -95,7 +95,7 @@ std::vector get_protocols(bool output) { } std::string get_build_config() { - return avcodec_configuration(); + return FFMPEG avcodec_configuration(); } ////////////////////////////////////////////////////////////////////////////// @@ -188,9 +188,9 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO { }; PYBIND11_MODULE(_torchaudio_ffmpeg, m) { - m.def("init", []() { avdevice_register_all(); }); - m.def("get_log_level", []() { return av_log_get_level(); }); - m.def("set_log_level", [](int level) { av_log_set_level(level); }); + m.def("init", []() { FFMPEG avdevice_register_all(); }); + m.def("get_log_level", []() { return FFMPEG av_log_get_level(); }); + m.def("set_log_level", [](int level) { FFMPEG av_log_set_level(level); }); m.def("get_versions", &get_versions); m.def("get_muxers", []() { return get_muxers(false); }); m.def("get_demuxers", []() { return get_demuxers(false); }); @@ -246,21 +246,22 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const OutputStreamInfo& o) -> std::string { - return av_get_media_type_string(o.media_type); + return FFMPEG av_get_media_type_string(o.media_type); }) .def_property_readonly( "format", [](const OutputStreamInfo& o) -> std::string { switch (o.media_type) { case AVMEDIA_TYPE_AUDIO: - return av_get_sample_fmt_name((AVSampleFormat)(o.format)); + return FFMPEG av_get_sample_fmt_name( + (AVSampleFormat)(o.format)); case AVMEDIA_TYPE_VIDEO: - return av_get_pix_fmt_name((AVPixelFormat)(o.format)); + return FFMPEG av_get_pix_fmt_name((AVPixelFormat)(o.format)); default: TORCH_INTERNAL_ASSERT( false, "FilterGraph is returning unexpected media type: ", - av_get_media_type_string(o.media_type)); + FFMPEG av_get_media_type_string(o.media_type)); } }) .def_readonly("sample_rate", &OutputStreamInfo::sample_rate) @@ -284,7 +285,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const SrcStreamInfo& s) { - return av_get_media_type_string(s.media_type); + return FFMPEG av_get_media_type_string(s.media_type); }) .def_readonly("codec_name", &SrcStreamInfo::codec_name) .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name) @@ -354,5 +355,4 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { } } // namespace -} // namespace io -} // namespace torchaudio +} // namespace 
torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp index 99e33e8367f..d0efae37da4 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp @@ -1,4 +1,5 @@ #include +#include #include #ifdef USE_CUDA @@ -429,11 +430,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_NV12 == sw_fmt, "Expected NV12 format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + FFMPEG av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -506,11 +507,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_P010 == sw_fmt, "Expected P010 format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + FFMPEG av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -581,11 +582,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_YUV444P == sw_fmt, "Expected YUV444P format. Found: ", - av_get_pix_fmt_name(sw_fmt)); + FFMPEG av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly for (int i = 0; i < num_channels; ++i) { diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp index 8caec7cb582..91a60f54eed 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp @@ -1,10 +1,11 @@ +#include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { + void PacketBuffer::push_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); - AVPacket* p = av_packet_clone(packet); + AVPacket* p = FFMPEG av_packet_clone(packet); TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); packets.emplace_back(p); } @@ -18,5 +19,4 @@ std::vector PacketBuffer::pop_packets() { bool PacketBuffer::has_packets() { return packets.size() > 0; } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp index 147d0bc2d57..f938537f2bc 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -48,7 +49,7 @@ FilterGraphFactory get_video_factory( f.add_video_sink(); f.add_process(filter_desc); if (hw_frames_ctx) { - f.create_filter(av_buffer_ref(hw_frames_ctx)); + f.create_filter(FFMPEG av_buffer_ref(hw_frames_ctx)); } else { f.create_filter(); } @@ -139,7 +140,7 @@ struct ProcessImpl : public IPostDecodeProcess { if (ret >= 0) { buffer.push_frame(converter.convert(frame), frame->pts); } - av_frame_unref(frame); + FFMPEG av_frame_unref(frame); } return ret; } @@ -159,7 +160,7 @@ std::unique_ptr get_unchunked_audio_process( TORCH_INTERNAL_ASSERT( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - 
av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); using B = UnchunkedBuffer; @@ -226,7 +227,7 @@ std::unique_ptr get_unchunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt)); } } @@ -239,7 +240,7 @@ std::unique_ptr get_chunked_audio_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); using B = ChunkedBuffer; B buffer{i.time_base, frames_per_chunk, num_chunks}; @@ -307,7 +308,7 @@ std::unique_ptr get_chunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt)); } } @@ -321,7 +322,7 @@ std::unique_ptr get_unchunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -375,7 +376,9 @@ std::unique_ptr get_unchunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); + false, + "Unexpected video format found: ", + FFMPEG av_get_pix_fmt_name(fmt)); } } } @@ -393,7 +396,7 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); using B = UnchunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -416,13 +419,13 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); } } #endif @@ -437,7 +440,7 @@ std::unique_ptr get_chunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -491,7 +494,9 @@ std::unique_ptr get_chunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); + false, + "Unexpected video format found: ", + FFMPEG av_get_pix_fmt_name(fmt)); } } } @@ -511,7 +516,7 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - av_get_media_type_string(i.type)); + FFMPEG av_get_media_type_string(i.type)); using B = ChunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -540,13 +545,13 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); + FFMPEG av_get_pix_fmt_name(fmt)); } } #endif diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp index a0bf22a0650..d7d5b40cbd1 100644 --- 
a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp @@ -1,11 +1,10 @@ #include +#include #include #include #include -namespace torchaudio { -namespace io { - +namespace torchaudio::io { namespace { AVCodecContextPtr alloc_codec_context( enum AVCodecID codec_id, @@ -13,24 +12,24 @@ AVCodecContextPtr alloc_codec_context( const AVCodec* codec = [&]() { if (decoder_name) { const AVCodec* c = - avcodec_find_decoder_by_name(decoder_name.value().c_str()); + FFMPEG avcodec_find_decoder_by_name(decoder_name.value().c_str()); TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value()); return c; } else { - const AVCodec* c = avcodec_find_decoder(codec_id); - TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id)); + const AVCodec* c = FFMPEG avcodec_find_decoder(codec_id); + TORCH_CHECK(c, "Unsupported codec: ", FFMPEG avcodec_get_name(codec_id)); return c; } }(); - AVCodecContext* codec_ctx = avcodec_alloc_context3(codec); + AVCodecContext* codec_ctx = FFMPEG avcodec_alloc_context3(codec); TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext."); return AVCodecContextPtr(codec_ctx); } const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) { for (int i = 0;; ++i) { - const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i); + const AVCodecHWConfig* config = FFMPEG avcodec_get_hw_config(codec, i); if (!config) { break; } @@ -83,7 +82,7 @@ enum AVPixelFormat get_hw_format( } AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { - AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); + AVBufferRef* p = FFMPEG av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); TORCH_CHECK( p, "Failed to allocate CUDA frame context from device context at ", @@ -94,11 +93,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { frames_ctx->width = codec_ctx->width; frames_ctx->height = codec_ctx->height; frames_ctx->initial_pool_size = 5; - int ret = av_hwframe_ctx_init(p); + int ret = FFMPEG av_hwframe_ctx_init(p); if (ret >= 0) { return p; } - av_buffer_unref(&p); + FFMPEG av_buffer_unref(&p); TORCH_CHECK( false, "Failed to initialize CUDA frame context: ", av_err2string(ret)); } @@ -107,7 +106,7 @@ void configure_codec_context( AVCodecContext* codec_ctx, const AVCodecParameters* params, const torch::Device& device) { - int ret = avcodec_parameters_to_context(codec_ctx, params); + int ret = FFMPEG avcodec_parameters_to_context(codec_ctx, params); TORCH_CHECK( ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret)); @@ -122,7 +121,8 @@ void configure_codec_context( // 2. Set pCodecContext->get_format call back function which // will retrieve the HW pixel format from opaque pointer. codec_ctx->get_format = get_hw_format; - codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); + codec_ctx->hw_device_ctx = + FFMPEG av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( codec_ctx->hw_device_ctx, "Failed to reference HW device context."); #endif @@ -135,16 +135,16 @@ void open_codec( AVDictionary* opts = get_option_dict(decoder_option); // Default to single thread execution. 
- if (!av_dict_get(opts, "threads", nullptr, 0)) { - av_dict_set(&opts, "threads", "1", 0); + if (!FFMPEG av_dict_get(opts, "threads", nullptr, 0)) { + FFMPEG av_dict_set(&opts, "threads", "1", 0); } if (!codec_ctx->channel_layout) { codec_ctx->channel_layout = - av_get_default_channel_layout(codec_ctx->channels); + FFMPEG av_get_default_channel_layout(codec_ctx->channels); } - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts); + int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opts); clean_up_dict(opts); TORCH_CHECK( ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret)); @@ -259,8 +259,8 @@ void StreamProcessor::remove_stream(KeyType key) { void StreamProcessor::set_discard_timestamp(int64_t timestamp) { TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative."); - discard_before_pts = - av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base); + discard_before_pts = FFMPEG av_rescale_q( + timestamp, FFMPEG av_get_time_base_q(), stream_time_base); } void StreamProcessor::set_decoder( @@ -306,9 +306,9 @@ int StreamProcessor::process_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - int ret = avcodec_send_packet(codec_ctx, packet); + int ret = FFMPEG avcodec_send_packet(codec_ctx, packet); while (ret >= 0) { - ret = avcodec_receive_frame(codec_ctx, frame); + ret = FFMPEG avcodec_receive_frame(codec_ctx, frame); // AVERROR(EAGAIN) means that new input data is required to return new // output. if (ret == AVERROR(EAGAIN)) @@ -355,7 +355,7 @@ int StreamProcessor::process_packet(AVPacket* packet) { } // else we can just unref the frame and continue - av_frame_unref(frame); + FFMPEG av_frame_unref(frame); } return ret; } @@ -364,7 +364,7 @@ void StreamProcessor::flush() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - avcodec_flush_buffers(codec_ctx); + FFMPEG avcodec_flush_buffers(codec_ctx); for (auto& ite : post_processes) { ite.second->flush(); } @@ -389,5 +389,4 @@ c10::optional StreamProcessor::pop_chunk(KeyType key) { return post_processes.at(key)->pop_chunk(); } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp index 0eec327aa51..086867af17a 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp @@ -1,12 +1,16 @@ #include +#include #include #include #include #include #include -namespace torchaudio { -namespace io { +extern "C" { +#include +} + +namespace torchaudio::io { using KeyType = StreamProcessor::KeyType; @@ -19,7 +23,7 @@ AVFormatContext* get_input_format_context( const c10::optional& format, const c10::optional& option, AVIOContext* io_ctx) { - AVFormatContext* p = avformat_alloc_context(); + AVFormatContext* p = FFMPEG avformat_alloc_context(); TORCH_CHECK(p, "Failed to allocate AVFormatContext."); if (io_ctx) { p->pb = io_ctx; @@ -29,7 +33,7 @@ AVFormatContext* get_input_format_context( if (format.has_value()) { std::string format_str = format.value(); AVFORMAT_CONST AVInputFormat* pInput = - av_find_input_format(format_str.c_str()); + FFMPEG av_find_input_format(format_str.c_str()); TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\""); return pInput; } @@ -37,7 +41,7 @@ AVFormatContext* get_input_format_context( }(); 
AVDictionary* opt = get_option_dict(option); - int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt); + int ret = FFMPEG avformat_open_input(&p, src.c_str(), pInputFormat, &opt); clean_up_dict(opt); TORCH_CHECK( @@ -53,7 +57,7 @@ AVFormatContext* get_input_format_context( StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) { C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader"); - int ret = avformat_find_stream_info(format_ctx, nullptr); + int ret = FFMPEG avformat_find_stream_info(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to find stream information: ", av_err2string(ret)); @@ -110,7 +114,7 @@ void validate_src_stream_type( "Stream ", i, " is not ", - av_get_media_type_string(type), + FFMPEG av_get_media_type_string(type), " stream."); } @@ -125,7 +129,7 @@ namespace { OptionDict parse_metadata(const AVDictionary* metadata) { AVDictionaryEntry* tag = nullptr; OptionDict ret; - while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { + while ((tag = FFMPEG av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { ret.emplace(std::string(tag->key), std::string(tag->value)); } return ret; @@ -148,7 +152,8 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { ret.num_frames = stream->nb_frames; ret.bits_per_sample = codecpar->bits_per_raw_sample; ret.metadata = parse_metadata(stream->metadata); - const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); + const AVCodecDescriptor* desc = + FFMPEG avcodec_descriptor_get(codecpar->codec_id); if (desc) { ret.codec_name = desc->name; ret.codec_long_name = desc->long_name; @@ -158,7 +163,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_AUDIO: { AVSampleFormat smp_fmt = static_cast(codecpar->format); if (smp_fmt != AV_SAMPLE_FMT_NONE) { - ret.fmt_name = av_get_sample_fmt_name(smp_fmt); + ret.fmt_name = FFMPEG av_get_sample_fmt_name(smp_fmt); } ret.sample_rate = static_cast(codecpar->sample_rate); ret.num_channels = codecpar->channels; @@ -167,7 +172,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_VIDEO: { AVPixelFormat pix_fmt = static_cast(codecpar->format); if (pix_fmt != AV_PIX_FMT_NONE) { - ret.fmt_name = av_get_pix_fmt_name(pix_fmt); + ret.fmt_name = FFMPEG av_get_pix_fmt_name(pix_fmt); } ret.width = codecpar->width; ret.height = codecpar->height; @@ -181,7 +186,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { namespace { AVCodecParameters* get_codecpar() { - AVCodecParameters* ptr = avcodec_parameters_alloc(); + AVCodecParameters* ptr = FFMPEG avcodec_parameters_alloc(); TORCH_CHECK(ptr, "Failed to allocate resource."); return ptr; } @@ -192,7 +197,7 @@ StreamParams StreamReader::get_src_stream_params(int i) { AVStream* stream = format_ctx->streams[i]; AVCodecParametersPtr codec_params(get_codecpar()); - int ret = avcodec_parameters_copy(codec_params, stream->codecpar); + int ret = FFMPEG avcodec_parameters_copy(codec_params, stream->codecpar); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. 
(", @@ -234,12 +239,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const { } int64_t StreamReader::find_best_audio_stream() const { - return av_find_best_stream( + return FFMPEG av_find_best_stream( format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0); } int64_t StreamReader::find_best_video_stream() const { - return av_find_best_stream( + return FFMPEG av_find_best_stream( format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); } @@ -289,7 +294,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) { TORCH_CHECK(false, "Invalid mode value: ", mode); } - int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); + int ret = FFMPEG av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); if (ret < 0) { seek_timestamp = 0; @@ -402,12 +407,12 @@ void StreamReader::add_stream( case AVMEDIA_TYPE_AUDIO: return AVRational{0, 1}; case AVMEDIA_TYPE_VIDEO: - return av_guess_frame_rate(format_ctx, stream, nullptr); + return FFMPEG av_guess_frame_rate(format_ctx, stream, nullptr); default: TORCH_INTERNAL_ASSERT( false, "Unexpected media type is given: ", - av_get_media_type_string(media_type)); + FFMPEG av_get_media_type_string(media_type)); } }(); int key = processors[i]->add_stream( @@ -446,7 +451,7 @@ void StreamReader::remove_stream(int64_t i) { // 1: It's done, caller should stop calling // <0: Some error happened int StreamReader::process_packet() { - int ret = av_read_frame(format_ctx, packet); + int ret = FFMPEG av_read_frame(format_ctx, packet); if (ret == AVERROR_EOF) { ret = drain(); return (ret < 0) ? ret : 1; @@ -577,12 +582,13 @@ AVIOContext* get_io_context( int buffer_size, int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); + unsigned char* buffer = + static_cast(FFMPEG av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( + AVIOContext* io_ctx = FFMPEG avio_alloc_context( buffer, buffer_size, 0, opaque, read_packet, nullptr, seek); if (!io_ctx) { - av_freep(&buffer); + FFMPEG av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; @@ -607,5 +613,4 @@ StreamReaderCustomIO::StreamReaderCustomIO( : CustomInput(opaque, buffer_size, read_packet, seek), StreamReader(io_ctx, format, option) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp index a0e18fb8d41..0afd1414f6a 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp @@ -1,7 +1,12 @@ #include +#include #include #include +extern "C" { +#include +} + namespace torchaudio::io { //////////////////////////////////////////////////////////////////////////////// @@ -56,7 +61,7 @@ void EncodeProcess::process_frame(AVFrame* src) { if (ret >= 0) { encoder.encode(dst_frame); } - av_frame_unref(dst_frame); + FFMPEG av_frame_unref(dst_frame); } } @@ -71,8 +76,8 @@ void EncodeProcess::flush() { namespace { enum AVSampleFormat get_src_sample_fmt(const std::string& src) { - auto fmt = av_get_sample_fmt(src.c_str()); - if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) { + auto fmt = FFMPEG av_get_sample_fmt(src.c_str()); + if (fmt != AV_SAMPLE_FMT_NONE && !FFMPEG av_sample_fmt_is_planar(fmt)) { return fmt; } TORCH_CHECK( @@ -89,7 +94,7 @@ enum 
AVSampleFormat get_src_sample_fmt(const std::string& src) { AV_SAMPLE_FMT_S64, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_DBL}) { - ret.emplace_back(av_get_sample_fmt_name(fmt)); + ret.emplace_back(FFMPEG av_get_sample_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -97,7 +102,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) { } enum AVPixelFormat get_src_pix_fmt(const std::string& src) { - AVPixelFormat fmt = av_get_pix_fmt(src.c_str()); + AVPixelFormat fmt = FFMPEG av_get_pix_fmt(src.c_str()); switch (fmt) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: @@ -118,7 +123,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) { AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_YUV444P}) { - ret.emplace_back(av_get_pix_fmt_name(fmt)); + ret.emplace_back(FFMPEG av_get_pix_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -132,18 +137,21 @@ const AVCodec* get_codec( AVCodecID default_codec, const c10::optional& encoder) { if (encoder) { - const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str()); + const AVCodec* c = + FFMPEG avcodec_find_encoder_by_name(encoder.value().c_str()); TORCH_CHECK(c, "Unexpected codec: ", encoder.value()); return c; } - const AVCodec* c = avcodec_find_encoder(default_codec); + const AVCodec* c = FFMPEG avcodec_find_encoder(default_codec); TORCH_CHECK( - c, "Encoder not found for codec: ", avcodec_get_name(default_codec)); + c, + "Encoder not found for codec: ", + FFMPEG avcodec_get_name(default_codec)); return c; } AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) { - AVCodecContext* ctx = avcodec_alloc_context3(codec); + AVCodecContext* ctx = FFMPEG avcodec_alloc_context3(codec); TORCH_CHECK(ctx, "Failed to allocate CodecContext."); if (flags & AVFMT_GLOBALHEADER) { @@ -169,25 +177,25 @@ void open_codec( // while "libopus" refers to the one depends on libopusenc // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251 if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { + if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); + FFMPEG av_dict_set(&opt, "strict", "experimental", 0); } } if (std::strcmp(codec_ctx->codec->name, "opus") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { + if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"opus\" encoder is selected. Enabling '-strict experimental'. 
", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); + FFMPEG av_dict_set(&opt, "strict", "experimental", 0); } } - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt); + int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opt); clean_up_dict(opt); TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")"); } @@ -214,7 +222,7 @@ bool supported_sample_fmt( std::string get_supported_formats(const AVSampleFormat* sample_fmts) { std::vector ret; while (*sample_fmts != AV_SAMPLE_FMT_NONE) { - ret.emplace_back(av_get_sample_fmt_name(*sample_fmts)); + ret.emplace_back(FFMPEG av_get_sample_fmt_name(*sample_fmts)); ++sample_fmts; } return c10::Join(", ", ret); @@ -226,7 +234,7 @@ AVSampleFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { auto& enc_fmt_val = encoder_format.value(); - auto fmt = av_get_sample_fmt(enc_fmt_val.c_str()); + auto fmt = FFMPEG av_get_sample_fmt(enc_fmt_val.c_str()); TORCH_CHECK( fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val); TORCH_CHECK( @@ -313,8 +321,8 @@ std::string get_supported_channels(const uint64_t* channel_layouts) { std::vector names; while (*channel_layouts) { std::stringstream ss; - ss << av_get_channel_layout_nb_channels(*channel_layouts); - ss << " (" << av_get_channel_name(*channel_layouts) << ")"; + ss << FFMPEG av_get_channel_layout_nb_channels(*channel_layouts); + ss << " (" << FFMPEG av_get_channel_name(*channel_layouts) << ")"; names.emplace_back(ss.str()); ++channel_layouts; } @@ -331,10 +339,10 @@ uint64_t get_channel_layout( TORCH_CHECK( val > 0, "The number of channels must be greater than 0. Found: ", val); if (!codec->channel_layouts) { - return static_cast(av_get_default_channel_layout(val)); + return static_cast(FFMPEG av_get_default_channel_layout(val)); } for (const uint64_t* it = codec->channel_layouts; *it; ++it) { - if (av_get_channel_layout_nb_channels(*it) == val) { + if (FFMPEG av_get_channel_layout_nb_channels(*it) == val) { return *it; } } @@ -371,8 +379,9 @@ void configure_audio_codec_ctx( const c10::optional& codec_config) { codec_ctx->sample_fmt = format; codec_ctx->sample_rate = sample_rate; - codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24)); - codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout); + codec_ctx->time_base = av_inv_q(FFMPEG av_d2q(sample_rate, 1 << 24)); + codec_ctx->channels = + FFMPEG av_get_channel_layout_nb_channels(channel_layout); codec_ctx->channel_layout = channel_layout; // Set optional stuff @@ -411,7 +420,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) { std::string get_supported_formats(const AVPixelFormat* pix_fmts) { std::vector ret; while (*pix_fmts != AV_PIX_FMT_NONE) { - ret.emplace_back(av_get_pix_fmt_name(*pix_fmts)); + ret.emplace_back(FFMPEG av_get_pix_fmt_name(*pix_fmts)); ++pix_fmts; } return c10::Join(", ", ret); @@ -423,7 +432,7 @@ AVPixelFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { const auto& val = encoder_format.value(); - auto fmt = av_get_pix_fmt(val.c_str()); + auto fmt = FFMPEG av_get_pix_fmt(val.c_str()); TORCH_CHECK( supported_pix_fmt(fmt, codec->pix_fmts), codec->name, @@ -461,7 +470,7 @@ AVRational get_enc_rate( std::isfinite(enc_rate) && enc_rate > 0, "Encoder sample rate must be positive and fininte. 
Found: ", enc_rate); - AVRational rate = av_d2q(enc_rate, 1 << 24); + AVRational rate = FFMPEG av_d2q(enc_rate, 1 << 24); TORCH_CHECK( supported_frame_rate(rate, codec->supported_framerates), codec->name, @@ -545,14 +554,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { // context to AVCodecContext. But this way, it will be deallocated // automatically at the time AVCodecContext is freed, so we do that. - ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); + ctx->hw_device_ctx = FFMPEG av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( ctx->hw_device_ctx, "Failed to reference HW device context."); ctx->sw_pix_fmt = ctx->pix_fmt; ctx->pix_fmt = AV_PIX_FMT_CUDA; - ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx); + ctx->hw_frames_ctx = FFMPEG av_hwframe_ctx_alloc(ctx->hw_device_ctx); TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context."); auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data); @@ -562,7 +571,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { frames_ctx->height = ctx->height; frames_ctx->initial_pool_size = 5; - int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx); + int ret = FFMPEG av_hwframe_ctx_init(ctx->hw_frames_ctx); TORCH_CHECK( ret >= 0, "Failed to initialize CUDA frame context: ", @@ -574,11 +583,11 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { //////////////////////////////////////////////////////////////////////////////// AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); + AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr); TORCH_CHECK(stream, "Failed to allocate stream."); stream->time_base = codec_ctx->time_base; - int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx); + int ret = FFMPEG avcodec_parameters_from_context(stream->codecpar, codec_ctx); TORCH_CHECK( ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret)); return stream; @@ -605,7 +614,7 @@ FilterGraph get_audio_filter_graph( if (filter_desc || src_fmt != enc_fmt || src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) { std::stringstream ss; - ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt) + ss << "aformat=sample_fmts=" << FFMPEG av_get_sample_fmt_name(enc_fmt) << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x" << std::hex << enc_ch_layout; parts.push_back(ss.str()); @@ -656,7 +665,7 @@ FilterGraph get_video_filter_graph( } if (filter_desc || src_fmt != enc_fmt) { std::stringstream ss; - ss << "format=" << av_get_pix_fmt_name(enc_fmt); + ss << "format=" << FFMPEG av_get_pix_fmt_name(enc_fmt); parts.emplace_back(ss.str()); } if (filter_desc || @@ -695,7 +704,7 @@ AVFramePtr get_audio_frame( frame->channel_layout = channel_layout; frame->sample_rate = sample_rate; frame->nb_samples = nb_samples; - int ret = av_frame_get_buffer(frame, 0); + int ret = FFMPEG av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating the source audio frame:", av_err2string(ret)); @@ -711,7 +720,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) { frame->format = src_fmt; frame->width = width; frame->height = height; - int ret = av_frame_get_buffer(frame, 0); + int ret = FFMPEG av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating a video buffer :", av_err2string(ret)); @@ -756,10 +765,10 @@ EncodeProcess 
get_audio_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. const AVSampleFormat src_fmt = (disable_converter) - ? av_get_sample_fmt(format.c_str()) + ? FFMPEG av_get_sample_fmt(format.c_str()) : get_src_sample_fmt(format); - const auto src_ch_layout = - static_cast(av_get_default_channel_layout(src_num_channels)); + const auto src_ch_layout = static_cast( + FFMPEG av_get_default_channel_layout(src_num_channels)); // 2. Fetch codec from default or override TORCH_CHECK( @@ -779,7 +788,7 @@ EncodeProcess get_audio_encode_process( // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277 // This is the case for at least until FFmpeg 6.0, so it will be // like this for a while. - return static_cast(av_get_default_channel_layout(2)); + return static_cast(FFMPEG av_get_default_channel_layout(2)); } return get_channel_layout(src_ch_layout, encoder_num_channels, codec); }(); @@ -867,9 +876,9 @@ EncodeProcess get_video_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. const AVPixelFormat src_fmt = (disable_converter) - ? av_get_pix_fmt(format.c_str()) + ? FFMPEG av_get_pix_fmt(format.c_str()) : get_src_pix_fmt(format); - const AVRational src_rate = av_d2q(frame_rate, 1 << 24); + const AVRational src_rate = FFMPEG av_d2q(frame_rate, 1 << 24); // 2. Fetch codec from default or override TORCH_CHECK( @@ -936,7 +945,8 @@ EncodeProcess get_video_encode_process( AVFramePtr src_frame = [&]() { if (codec_ctx->hw_frames_ctx) { AVFramePtr frame{alloc_avframe()}; - int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); + int ret = + FFMPEG av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret)); frame->nb_samples = 1; frame->pts = 0; diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp index 3d2e5015357..59010a18d42 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp @@ -1,3 +1,4 @@ +#include #include namespace torchaudio::io { @@ -13,10 +14,10 @@ Encoder::Encoder( /// /// @param frame Frame data to encode void Encoder::encode(AVFrame* frame) { - int ret = avcodec_send_frame(codec_ctx, frame); + int ret = FFMPEG avcodec_send_frame(codec_ctx, frame); TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ")."); while (ret >= 0) { - ret = avcodec_receive_packet(codec_ctx, packet); + ret = FFMPEG avcodec_receive_packet(codec_ctx, packet); if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { if (ret == AVERROR_EOF) { // Note: @@ -31,7 +32,7 @@ void Encoder::encode(AVFrame* frame) { // An alternative is to use `av_write_frame` functoin, but in that case // client code is responsible for ordering packets, which makes it // complicated to use StreamWriter - ret = av_interleaved_write_frame(format_ctx, nullptr); + ret = FFMPEG av_interleaved_write_frame(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to flush packet (", av_err2string(ret), ")."); } @@ -51,10 +52,11 @@ void Encoder::encode(AVFrame* frame) { // This has to be set before av_packet_rescale_ts bellow. 
packet->duration = 1; } - av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base); + FFMPEG av_packet_rescale_ts( + packet, codec_ctx->time_base, stream->time_base); packet->stream_index = stream->index; - ret = av_interleaved_write_frame(format_ctx, packet); + ret = FFMPEG av_interleaved_write_frame(format_ctx, packet); TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ")."); } } diff --git a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp index 0701c5a5965..0bb196807e9 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp @@ -1,3 +1,4 @@ +#include #include namespace torchaudio::io { @@ -5,9 +6,9 @@ namespace { AVStream* add_stream( AVFormatContext* format_ctx, const StreamParams& stream_params) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); - int ret = - avcodec_parameters_copy(stream->codecpar, stream_params.codec_params); + AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr); + int ret = FFMPEG avcodec_parameters_copy( + stream->codecpar, stream_params.codec_params); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. (", @@ -26,11 +27,12 @@ PacketWriter::PacketWriter( void PacketWriter::write_packet(const AVPacketPtr& packet) { AVPacket dst_packet; - int ret = av_packet_ref(&dst_packet, packet); + int ret = FFMPEG av_packet_ref(&dst_packet, packet); TORCH_CHECK(ret >= 0, "Failed to copy packet."); - av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); + FFMPEG av_packet_rescale_ts( + &dst_packet, original_time_base, stream->time_base); dst_packet.stream_index = stream->index; - ret = av_interleaved_write_frame(format_ctx, &dst_packet); + ret = FFMPEG av_interleaved_write_frame(format_ctx, &dst_packet); TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); } } // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp index df51d92355c..17bcf2e2bcb 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp @@ -1,11 +1,11 @@ +#include #include #ifdef USE_CUDA #include #endif -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { AVFormatContext* get_output_format_context( @@ -19,7 +19,7 @@ AVFormatContext* get_output_format_context( } AVFormatContext* p = nullptr; - int ret = avformat_alloc_output_context2( + int ret = FFMPEG avformat_alloc_output_context2( &p, nullptr, format ? 
format.value().c_str() : nullptr, dst.c_str()); TORCH_CHECK( ret >= 0, @@ -208,14 +208,14 @@ void StreamWriter::add_video_frame_stream( } void StreamWriter::set_metadata(const OptionDict& metadata) { - av_dict_free(&format_ctx->metadata); + FFMPEG av_dict_free(&format_ctx->metadata); for (auto const& [key, value] : metadata) { - av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); + FFMPEG av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); } } void StreamWriter::dump_format(int64_t i) { - av_dump_format(format_ctx, (int)i, format_ctx->url, 1); + FFMPEG av_dump_format(format_ctx, (int)i, format_ctx->url, 1); } void StreamWriter::open(const c10::optional& option) { @@ -231,10 +231,10 @@ void StreamWriter::open(const c10::optional& option) { AVDictionary* opt = get_option_dict(option); if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { - ret = avio_open2( + ret = FFMPEG avio_open2( &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt); if (ret < 0) { - av_dict_free(&opt); + FFMPEG av_dict_free(&opt); TORCH_CHECK( false, "Failed to open dst: ", @@ -245,7 +245,7 @@ void StreamWriter::open(const c10::optional& option) { } } - ret = avformat_write_header(format_ctx, &opt); + ret = FFMPEG avformat_write_header(format_ctx, &opt); clean_up_dict(opt); TORCH_CHECK( ret >= 0, @@ -258,7 +258,7 @@ void StreamWriter::open(const c10::optional& option) { } void StreamWriter::close() { - int ret = av_write_trailer(format_ctx); + int ret = FFMPEG av_write_trailer(format_ctx); if (ret < 0) { LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ")."; } @@ -269,7 +269,7 @@ void StreamWriter::close() { if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { // avio_closep can be only applied to AVIOContext opened by avio_open - avio_closep(&(format_ctx->pb)); + FFMPEG avio_closep(&(format_ctx->pb)); } is_open = false; } @@ -355,12 +355,13 @@ AVIOContext* get_io_context( int buffer_size, int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); + unsigned char* buffer = + static_cast(FFMPEG av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( + AVIOContext* io_ctx = FFMPEG avio_alloc_context( buffer, buffer_size, 1, opaque, nullptr, write_packet, seek); if (!io_ctx) { - av_freep(&buffer); + FFMPEG av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; @@ -384,5 +385,4 @@ StreamWriterCustomIO::StreamWriterCustomIO( : CustomOutput(opaque, buffer_size, write_packet, seek), StreamWriter(io_ctx, format) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp index e9350f0479a..1e3d07067f4 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp @@ -1,3 +1,4 @@ +#include #include #ifdef USE_CUDA @@ -5,7 +6,6 @@ #endif namespace torchaudio::io { - namespace { using InitFunc = TensorConverter::InitFunc; @@ -41,8 +41,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334 - if 
(!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!FFMPEG av_frame_is_writable(buffer)) { + int ret = FFMPEG av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -145,8 +145,8 @@ void write_interlaced_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!FFMPEG av_frame_is_writable(buffer)) { + int ret = FFMPEG av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -187,7 +187,7 @@ void write_planar_video( AVFrame* buffer, int num_planes) { const auto num_colors = - av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; + FFMPEG av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors); @@ -195,8 +195,8 @@ void write_planar_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); + if (!FFMPEG av_frame_is_writable(buffer)) { + int ret = FFMPEG av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -308,7 +308,7 @@ std::pair get_video_func(AVFrame* buffer) { TORCH_CHECK( false, "Unexpected pixel format for CUDA: ", - av_get_pix_fmt_name(sw_pix_fmt)); + FFMPEG av_get_pix_fmt_name(sw_pix_fmt)); } } @@ -317,7 +317,7 @@ std::pair get_video_func(AVFrame* buffer) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: case AV_PIX_FMT_BGR24: { - int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components; + int channels = FFMPEG av_pix_fmt_desc_get(pix_fmt)->nb_components; InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) { validate_video_input(t, f, channels); return init_interlaced(t); @@ -339,7 +339,9 @@ std::pair get_video_func(AVFrame* buffer) { } default: TORCH_CHECK( - false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt)); + false, + "Unexpected pixel format: ", + FFMPEG av_get_pix_fmt_name(pix_fmt)); } } @@ -383,7 +385,9 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size) break; default: TORCH_INTERNAL_ASSERT( - false, "Unsupported media type: ", av_get_media_type_string(type)); + false, + "Unsupported media type: ", + FFMPEG av_get_media_type_string(type)); } } diff --git a/torchaudio/csrc/ffmpeg/stub.cpp b/torchaudio/csrc/ffmpeg/stub.cpp new file mode 100644 index 00000000000..4960b0050e3 --- /dev/null +++ b/torchaudio/csrc/ffmpeg/stub.cpp @@ -0,0 +1,196 @@ +#ifdef DLOPEN_FFMPEG + +#include +#include +#include + +extern "C" { +#include +#include +#include +#include +#include +} + +namespace torchaudio::io::detail { +namespace { +class StubImpl { + at::DynamicLibrary libavutil; + at::DynamicLibrary libavcodec; + at::DynamicLibrary libavformat; + at::DynamicLibrary libavdevice; + at::DynamicLibrary libavfilter; + + public: + // The struct that holds all the function pointers to be used. 
+ FFmpegStub stub{}; + + StubImpl( + const char* util, + const char* codec, + const char* format, + const char* device, + const char* filter) + : libavutil(util), + libavcodec(codec), + libavformat(format), + libavdevice(device), + libavfilter(filter) { +#define set(X) stub.X = (decltype(FFmpegStub::X))libavutil.sym(#X) + set(av_buffer_ref); + set(av_buffer_unref); + set(av_d2q); + set(av_dict_free); + set(av_dict_get); + set(av_dict_set); + set(av_frame_alloc); + set(av_frame_free); + set(av_frame_get_buffer); + set(av_frame_is_writable); + set(av_frame_make_writable); + set(av_frame_unref); + set(av_freep); + set(av_get_channel_layout_nb_channels); + set(av_get_channel_name); + set(av_get_default_channel_layout); + set(av_get_media_type_string); + set(av_get_pix_fmt); + set(av_get_pix_fmt_name); + set(av_get_sample_fmt); + set(av_get_sample_fmt_name); + set(av_get_time_base_q); + set(av_hwdevice_ctx_create); + set(av_hwframe_ctx_alloc); + set(av_hwframe_ctx_init); + set(av_hwframe_get_buffer); + set(av_log_get_level); + set(av_log_set_level); + set(av_malloc); + set(av_pix_fmt_desc_get); + set(av_rescale_q); + set(av_sample_fmt_is_planar); + set(av_strdup); + set(av_strerror); + set(avutil_version); +#undef set + +#define set(X) stub.X = (decltype(FFmpegStub::X))libavcodec.sym(#X) + set(av_codec_is_decoder); + set(av_codec_is_encoder); + set(av_codec_iterate); + set(av_packet_alloc); + set(av_packet_clone); + set(av_packet_free); + set(av_packet_ref); + set(av_packet_rescale_ts); + set(av_packet_unref); + set(avcodec_alloc_context3); + set(avcodec_configuration); + set(avcodec_descriptor_get); + set(avcodec_find_decoder); + set(avcodec_find_decoder_by_name); + set(avcodec_find_encoder); + set(avcodec_find_encoder_by_name); + set(avcodec_flush_buffers); + set(avcodec_free_context); + set(avcodec_get_hw_config); + set(avcodec_get_name); + set(avcodec_open2); + set(avcodec_parameters_alloc); + set(avcodec_parameters_copy); + set(avcodec_parameters_free); + set(avcodec_parameters_from_context); + set(avcodec_parameters_to_context); + set(avcodec_receive_frame); + set(avcodec_receive_packet); + set(avcodec_send_frame); + set(avcodec_send_packet); + set(avcodec_version); +#undef set + +#define set(X) stub.X = (decltype(FFmpegStub::X))libavformat.sym(#X) + set(av_demuxer_iterate); + set(av_dump_format); + set(av_find_best_stream); + set(av_find_input_format); + set(av_guess_frame_rate); + set(av_interleaved_write_frame); + set(av_muxer_iterate); + set(av_read_frame); + set(av_seek_frame); + set(av_write_trailer); + set(avio_alloc_context); + set(avio_enum_protocols); + set(avio_closep); + set(avio_flush); + set(avio_open2); + set(avformat_alloc_context); + set(avformat_alloc_output_context2); + set(avformat_close_input); + set(avformat_find_stream_info); + set(avformat_free_context); + set(avformat_new_stream); + set(avformat_open_input); + set(avformat_version); + set(avformat_write_header); +#undef set + +#define set(X) stub.X = (decltype(FFmpegStub::X))libavdevice.sym(#X) + set(avdevice_register_all); + set(avdevice_version); +#undef set + +#define set(X) stub.X = (decltype(FFmpegStub::X))libavfilter.sym(#X) + set(av_buffersink_get_frame); + set(av_buffersrc_add_frame_flags); + set(avfilter_get_by_name); + set(avfilter_graph_alloc); + set(avfilter_graph_config); + set(avfilter_graph_create_filter); + set(avfilter_graph_free); + set(avfilter_graph_parse_ptr); + set(avfilter_inout_alloc); + set(avfilter_inout_free); + set(avfilter_version); +#undef set + } +}; + +static std::unique_ptr _stub; 
+
+void _init_stub() {
+#if defined(_WIN32)
+  _stub = std::make_unique<StubImpl>(
+      "avutil-" AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dll",
+      "avcodec-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dll",
+      "avformat-" AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dll",
+      "avdevice-" AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dll",
+      "avfilter-" AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dll");
+#elif defined(__APPLE__)
+  _stub = std::make_unique<StubImpl>(
+      "libavutil." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dylib",
+      "libavcodec." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dylib",
+      "libavformat." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dylib",
+      "libavdevice." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dylib",
+      "libavfilter." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dylib");
+#else
+  _stub = std::make_unique<StubImpl>(
+      "libavutil.so." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR),
+      "libavcodec.so." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR),
+      "libavformat.so." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR),
+      "libavdevice.so." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR),
+      "libavfilter.so." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR));
+#endif
+}
+
+} // namespace
+
+FFmpegStub& ffmpeg_stub() {
+  static c10::once_flag init_flag;
+  c10::call_once(init_flag, _init_stub);
+  return _stub->stub;
+}
+
+} // namespace torchaudio::io::detail
+
+#endif
diff --git a/torchaudio/csrc/ffmpeg/stub.h b/torchaudio/csrc/ffmpeg/stub.h
new file mode 100644
index 00000000000..ae6e0a3d1c2
--- /dev/null
+++ b/torchaudio/csrc/ffmpeg/stub.h
@@ -0,0 +1,313 @@
+#pragma once
+
+// Abstraction of the access to the FFmpeg libraries.
+//
+// Do not include this in header files.
+// Include this header in implementation files and prepend
+// the FFMPEG macro to all calls to libav functions.
+//
+// If DLOPEN_FFMPEG is not defined, the FFMPEG macro is empty.
+// In this case, the FFmpeg libraries are linked when torchaudio is built.
+//
+// If DLOPEN_FFMPEG is defined, the FFMPEG macro becomes a function call that
+// fetches a stub instance of the FFmpeg libraries.
+// This function also initializes the function pointers by automatically
+// dlopening all the required libraries.
+//
+
+#ifndef DLOPEN_FFMPEG
+#define FFMPEG
+#else
+#define FFMPEG detail::ffmpeg_stub(). 
+ +#include + +namespace torchaudio::io::detail { + +struct FFmpegStub; + +// dlopen FFmpeg libraries and populate the methods of stub instance, +// then return the reference to the stub instance +FFmpegStub& ffmpeg_stub(); + +struct FFmpegStub { + ///////////////////////////////////////////////////////////////////////////// + // libavutil + ///////////////////////////////////////////////////////////////////////////// + + AVBufferRef* (*av_buffer_ref)(const AVBufferRef*); + + void (*av_buffer_unref)(AVBufferRef**); + + AVRational (*av_d2q)(double, int) av_const; + + void (*av_dict_free)(AVDictionary**); + + AVDictionaryEntry* (*av_dict_get)( + const AVDictionary*, + const char*, + const AVDictionaryEntry*, + int); + + int (*av_dict_set)(AVDictionary**, const char*, const char*, int); + + AVFrame* (*av_frame_alloc)(); + + void (*av_frame_free)(AVFrame**); + + int (*av_frame_get_buffer)(AVFrame*, int); + + int (*av_frame_is_writable)(AVFrame*); + + int (*av_frame_make_writable)(AVFrame*); + + void (*av_frame_unref)(AVFrame*); + + void (*av_freep)(void*); + + int (*av_get_channel_layout_nb_channels)(uint64_t); + + const char* (*av_get_channel_name)(uint64_t); + + int64_t (*av_get_default_channel_layout)(int); + + const char* (*av_get_media_type_string)(enum AVMediaType); + + enum AVPixelFormat (*av_get_pix_fmt)(const char*); + + const char* (*av_get_pix_fmt_name)(enum AVPixelFormat); + + enum AVSampleFormat (*av_get_sample_fmt)(const char*); + + const char* (*av_get_sample_fmt_name)(enum AVSampleFormat); + + AVRational (*av_get_time_base_q)(); + + int (*av_hwdevice_ctx_create)( + AVBufferRef**, + enum AVHWDeviceType, + const char*, + AVDictionary*, + int); + + AVBufferRef* (*av_hwframe_ctx_alloc)(AVBufferRef*); + + int (*av_hwframe_ctx_init)(AVBufferRef*); + + int (*av_hwframe_get_buffer)(AVBufferRef*, AVFrame*, int); + + int (*av_log_get_level)(); + + void (*av_log_set_level)(int); + + void* (*av_malloc)(size_t); + + const AVPixFmtDescriptor* (*av_pix_fmt_desc_get)(enum AVPixelFormat); + + int64_t (*av_rescale_q)(int64_t, AVRational, AVRational) av_const; + + int (*av_sample_fmt_is_planar)(enum AVSampleFormat); + + char* (*av_strdup)(const char*); + + int (*av_strerror)(int, char*, size_t); + + unsigned (*avutil_version)(); + + ///////////////////////////////////////////////////////////////////////////// + // libavcodec + ///////////////////////////////////////////////////////////////////////////// + + int (*av_codec_is_decoder)(const AVCodec*); + + int (*av_codec_is_encoder)(const AVCodec*); + + const AVCodec* (*av_codec_iterate)(void**); + + AVPacket* (*av_packet_alloc)(); + + AVPacket* (*av_packet_clone)(const AVPacket*); + + void (*av_packet_free)(AVPacket**); + + int (*av_packet_ref)(AVPacket*, const AVPacket*); + + void (*av_packet_rescale_ts)(AVPacket*, AVRational, AVRational); + + void (*av_packet_unref)(AVPacket*); + + AVCodecContext* (*avcodec_alloc_context3)(const AVCodec*); + + const char* (*avcodec_configuration)(); + + const AVCodecDescriptor* (*avcodec_descriptor_get)(enum AVCodecID); + + AVCodec* (*avcodec_find_decoder)(enum AVCodecID); + + AVCodec* (*avcodec_find_decoder_by_name)(const char*); + + AVCodec* (*avcodec_find_encoder)(enum AVCodecID); + + AVCodec* (*avcodec_find_encoder_by_name)(const char*); + + void (*avcodec_flush_buffers)(AVCodecContext*); + + void (*avcodec_free_context)(AVCodecContext**); + + const AVCodecHWConfig* (*avcodec_get_hw_config)(const AVCodec*, int); + + const char* (*avcodec_get_name)(enum AVCodecID); + + int (*avcodec_open2)(AVCodecContext*, 
const AVCodec*, AVDictionary**); + + AVCodecParameters* (*avcodec_parameters_alloc)(); + + int (*avcodec_parameters_copy)(AVCodecParameters*, const AVCodecParameters*); + + void (*avcodec_parameters_free)(AVCodecParameters**); + + int (*avcodec_parameters_from_context)( + AVCodecParameters*, + const AVCodecContext*); + + int (*avcodec_parameters_to_context)( + AVCodecContext*, + const AVCodecParameters*); + + int (*avcodec_receive_frame)(AVCodecContext*, AVFrame*); + + int (*avcodec_receive_packet)(AVCodecContext*, AVPacket*); + + int (*avcodec_send_frame)(AVCodecContext*, const AVFrame*); + + int (*avcodec_send_packet)(AVCodecContext*, const AVPacket*); + + unsigned (*avcodec_version)(); + + ///////////////////////////////////////////////////////////////////////////// + // libavformat + ///////////////////////////////////////////////////////////////////////////// + + const AVInputFormat* (*av_demuxer_iterate)(void**); + + void (*av_dump_format)(AVFormatContext*, int, const char*, int); + + int (*av_find_best_stream)( + AVFormatContext*, + enum AVMediaType, + int, + int, + AVCodec**, + int); + + AVInputFormat* (*av_find_input_format)(const char*); + + AVRational (*av_guess_frame_rate)(AVFormatContext*, AVStream*, AVFrame*); + + int (*av_interleaved_write_frame)(AVFormatContext*, AVPacket*); + + const AVOutputFormat* (*av_muxer_iterate)(void**); + + int (*av_read_frame)(AVFormatContext*, AVPacket*); + + int (*av_seek_frame)(AVFormatContext*, int, int64_t, int); + + int (*av_write_trailer)(AVFormatContext* s); + + AVIOContext* (*avio_alloc_context)( + unsigned char*, + int, + int, + void*, + int (*)(void*, uint8_t*, int), + int (*)(void*, uint8_t*, int), + int64_t (*)(void*, int64_t, int)); + + const char* (*avio_enum_protocols)(void**, int); + + int (*avio_closep)(AVIOContext**); + + void (*avio_flush)(AVIOContext*); + + int (*avio_open2)( + AVIOContext**, + const char*, + int, + const AVIOInterruptCB*, + AVDictionary**); + + AVFormatContext* (*avformat_alloc_context)(); + + int (*avformat_alloc_output_context2)( + AVFormatContext**, + AVOutputFormat*, + const char*, + const char*); + + void (*avformat_close_input)(AVFormatContext**); + + int (*avformat_find_stream_info)(AVFormatContext*, AVDictionary**); + + void (*avformat_free_context)(AVFormatContext*); + + AVStream* (*avformat_new_stream)(AVFormatContext*, const AVCodec*); + + int (*avformat_open_input)( + AVFormatContext**, + const char*, + AVFORMAT_CONST AVInputFormat*, + AVDictionary**); + + unsigned (*avformat_version)(); + + int (*avformat_write_header)(AVFormatContext*, AVDictionary**); + + ///////////////////////////////////////////////////////////////////////////// + // libavdevice + ///////////////////////////////////////////////////////////////////////////// + + void (*avdevice_register_all)(); + + unsigned (*avdevice_version)(); + + ///////////////////////////////////////////////////////////////////////////// + // libavfilter + ///////////////////////////////////////////////////////////////////////////// + + int (*av_buffersink_get_frame)(AVFilterContext*, AVFrame*); + + int (*av_buffersrc_add_frame_flags)(AVFilterContext*, AVFrame*, int); + + const AVFilter* (*avfilter_get_by_name)(const char*); + + AVFilterGraph* (*avfilter_graph_alloc)(); + + int (*avfilter_graph_config)(AVFilterGraph*, void*); + + int (*avfilter_graph_create_filter)( + AVFilterContext**, + const AVFilter*, + const char*, + const char*, + void*, + AVFilterGraph*); + + void (*avfilter_graph_free)(AVFilterGraph**); + + int (*avfilter_graph_parse_ptr)( + 
AVFilterGraph*, + const char*, + AVFilterInOut**, + AVFilterInOut**, + void*); + + AVFilterInOut* (*avfilter_inout_alloc)(); + + void (*avfilter_inout_free)(AVFilterInOut**); + + unsigned (*avfilter_version)(); +}; + +} // namespace torchaudio::io::detail + +#endif
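
To illustrate the call-site convention described in the stub.h header comment above, here is a minimal sketch (not part of the patch) of how code is expected to invoke libav functions under both build modes. The helper print_ffmpeg_versions and its includes are assumptions for illustration; only the FFMPEG macro, detail::ffmpeg_stub(), and the avcodec_version/avformat_version entries of FFmpegStub come from the patch itself.

    // Minimal call-site sketch, assuming <torchaudio/csrc/ffmpeg/ffmpeg.h>
    // pulls in the libav headers (as the existing call sites do).
    // print_ffmpeg_versions is a hypothetical helper, not part of the patch.
    #include <cstdio>
    #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
    #include <torchaudio/csrc/ffmpeg/stub.h>

    namespace torchaudio::io {

    void print_ffmpeg_versions() {
      // With DLOPEN_FFMPEG off, FFMPEG expands to nothing and these calls
      // are resolved by the linker against the FFmpeg libraries.
      // With DLOPEN_FFMPEG on, FFMPEG expands to `detail::ffmpeg_stub().`;
      // the first call dlopens the five libraries once and the calls go
      // through the populated function pointers.
      unsigned codec = FFMPEG avcodec_version();
      unsigned format = FFMPEG avformat_version();
      std::printf(
          "libavcodec  %u.%u.%u\n",
          codec >> 16,
          (codec >> 8) & 0xff,
          codec & 0xff);
      std::printf(
          "libavformat %u.%u.%u\n",
          format >> 16,
          (format >> 8) & 0xff,
          format & 0xff);
    }

    } // namespace torchaudio::io

Because the macro is expanded per call site, existing code only needs the FFMPEG prefix and the stub.h include; no function signatures change between the two build modes.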