Use dlopen for FFmpeg (pytorch#3353)
Summary:
This commit changes the way the FFmpeg extension is built and used.
Instead of linking the (LGPL) FFmpeg libraries to torchaudio at build time,
it now uses dlopen to search for and load them at run time.

For dlopen-ing, we use PyTorch's `at::DynamicLibrary` class, which provides
a portable wrapper.

Pull Request resolved: pytorch#3353

Differential Revision: D46059199

Pulled By: mthrok

fbshipit-source-id: 4493a5fd8a4c802178d20276522f5334d637307d
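
The `libav()` dispatch table referenced throughout the hunks below is defined in the new libav.h/libav.cpp files added by this commit (not shown here). As a rough illustration only, the following is a minimal sketch of how such a table could be assembled on top of `at::DynamicLibrary`; the struct layout, the library sonames, and the lazy-initialization pattern are assumptions made for the example, not the actual torchaudio code.

// Hypothetical sketch: a dlopen-based dispatch table built with
// at::DynamicLibrary. Member names mirror the FFmpeg C API; sonames and
// initialization details are illustrative assumptions.
#include <ATen/DynamicLibrary.h>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/dict.h>
}

namespace torchaudio::io::detail {

struct LibAV {
  // One function pointer per FFmpeg symbol used by torchaudio, typed to match
  // the real C signature so call sites read like direct calls.
  decltype(&::av_dict_set) av_dict_set = nullptr;
  decltype(&::avcodec_free_context) avcodec_free_context = nullptr;
  // ... remaining members omitted
};

inline LibAV& libav() {
  static LibAV table = [] {
    // at::DynamicLibrary portably wraps dlopen/LoadLibrary; keeping the
    // objects static keeps the libraries loaded for the process lifetime.
    static at::DynamicLibrary avutil("libavutil.so.56");   // assumed soname
    static at::DynamicLibrary avcodec("libavcodec.so.58"); // assumed soname
    LibAV t;
    t.av_dict_set =
        reinterpret_cast<decltype(&::av_dict_set)>(avutil.sym("av_dict_set"));
    t.avcodec_free_context = reinterpret_cast<decltype(&::avcodec_free_context)>(
        avcodec.sym("avcodec_free_context"));
    return t;
  }();
  return table;
}

} // namespace torchaudio::io::detail

With a table like this in place, call sites change only from direct calls such as av_dict_set(...) to libav().av_dict_set(...), which is the pattern visible in the diffs below.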
mthrok authored and facebook-github-bot committed Jun 1, 2023
1 parent bc54ac8 commit b14ced1
Showing 19 changed files with 789 additions and 216 deletions.
3 changes: 1 addition & 2 deletions torchaudio/csrc/ffmpeg/CMakeLists.txt
@@ -2,11 +2,10 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)
add_library(ffmpeg INTERFACE)
target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}")
target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}")


set(
sources
libav.cpp
ffmpeg.cpp
filter_graph.cpp
hw_context.cpp
37 changes: 20 additions & 17 deletions torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -1,5 +1,6 @@
#include <c10/util/Exception.h>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <sstream>
#include <stdexcept>
#include <string>
@@ -8,14 +9,16 @@
namespace torchaudio {
namespace io {

using torchaudio::io::detail::libav;

////////////////////////////////////////////////////////////////////////////////
// AVDictionary
////////////////////////////////////////////////////////////////////////////////
AVDictionary* get_option_dict(const c10::optional<OptionDict>& option) {
AVDictionary* opt = nullptr;
if (option) {
for (auto const& [key, value] : option.value()) {
av_dict_set(&opt, key.c_str(), value.c_str(), 0);
libav().av_dict_set(&opt, key.c_str(), value.c_str(), 0);
}
}
return opt;
@@ -26,10 +29,10 @@ void clean_up_dict(AVDictionary* p) {
std::vector<std::string> unused_keys;
// Check and copy unused keys, clean up the original dictionary
AVDictionaryEntry* t = nullptr;
while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
while ((t = libav().av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
unused_keys.emplace_back(t->key);
}
av_dict_free(&p);
libav().av_dict_free(&p);
TORCH_CHECK(
unused_keys.empty(),
"Unexpected options: ",
@@ -41,14 +44,14 @@ void clean_up_dict(AVDictionary* p) {
// AVFormatContext
////////////////////////////////////////////////////////////////////////////////
void AVFormatInputContextDeleter::operator()(AVFormatContext* p) {
avformat_close_input(&p);
libav().avformat_close_input(&p);
};

AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p)
: Wrapper<AVFormatContext, AVFormatInputContextDeleter>(p) {}

void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) {
avformat_free_context(p);
libav().avformat_free_context(p);
};

AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
@@ -58,9 +61,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
// AVIO
////////////////////////////////////////////////////////////////////////////////
void AVIOContextDeleter::operator()(AVIOContext* p) {
avio_flush(p);
av_freep(&p->buffer);
av_freep(&p);
libav().avio_flush(p);
libav().av_freep(&p->buffer);
libav().av_freep(&p);
};

AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
@@ -70,13 +73,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
// AVPacket
////////////////////////////////////////////////////////////////////////////////
void AVPacketDeleter::operator()(AVPacket* p) {
av_packet_free(&p);
libav().av_packet_free(&p);
};

AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper<AVPacket, AVPacketDeleter>(p) {}

AVPacketPtr alloc_avpacket() {
AVPacket* p = av_packet_alloc();
AVPacket* p = libav().av_packet_alloc();
TORCH_CHECK(p, "Failed to allocate AVPacket object.");
return AVPacketPtr{p};
}
@@ -86,7 +89,7 @@ AVPacketPtr alloc_avpacket() {
////////////////////////////////////////////////////////////////////////////////
AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){};
AutoPacketUnref::~AutoPacketUnref() {
av_packet_unref(p_);
libav().av_packet_unref(p_);
}
AutoPacketUnref::operator AVPacket*() const {
return p_;
@@ -96,13 +99,13 @@ AutoPacketUnref::operator AVPacket*() const {
// AVFrame
////////////////////////////////////////////////////////////////////////////////
void AVFrameDeleter::operator()(AVFrame* p) {
av_frame_free(&p);
libav().av_frame_free(&p);
};

AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper<AVFrame, AVFrameDeleter>(p) {}

AVFramePtr alloc_avframe() {
AVFrame* p = av_frame_alloc();
AVFrame* p = libav().av_frame_alloc();
TORCH_CHECK(p, "Failed to allocate AVFrame object.");
return AVFramePtr{p};
};
@@ -111,7 +114,7 @@ AVFramePtr alloc_avframe() {
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
void AVCodecContextDeleter::operator()(AVCodecContext* p) {
avcodec_free_context(&p);
libav().avcodec_free_context(&p);
};

AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
@@ -121,7 +124,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
// AVBufferRefPtr
////////////////////////////////////////////////////////////////////////////////
void AutoBufferUnref::operator()(AVBufferRef* p) {
av_buffer_unref(&p);
libav().av_buffer_unref(&p);
}

AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
@@ -131,7 +134,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
// AVFilterGraph
////////////////////////////////////////////////////////////////////////////////
void AVFilterGraphDeleter::operator()(AVFilterGraph* p) {
avfilter_graph_free(&p);
libav().avfilter_graph_free(&p);
};

AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
@@ -141,7 +144,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
// AVCodecParameters
////////////////////////////////////////////////////////////////////////////////
void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) {
avcodec_parameters_free(&codecpar);
libav().avcodec_parameters_free(&codecpar);
}

AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p)
15 changes: 5 additions & 10 deletions torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -6,6 +6,9 @@
#include <memory>
#include <string>

#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/macro.h>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavdevice/avdevice.h>
@@ -29,21 +32,13 @@ namespace io {

using OptionDict = std::map<std::string, std::string>;

// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
// Starting from libavformat 59 (ffmpeg 5),
// AVInputFormat is const and related functions expect constant.
#if LIBAVFORMAT_VERSION_MAJOR >= 59
#define AVFORMAT_CONST const
#else
#define AVFORMAT_CONST
#endif

// Replacement of av_err2str, which causes
// `error: taking address of temporary array`
// https://github.com/joncampbell123/composite-video-simulator/issues/5
av_always_inline std::string av_err2string(int errnum) {
char str[AV_ERROR_MAX_STRING_SIZE];
return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum);
detail::libav().av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE);
return str;
}

// Base structure that handles memory management.
42 changes: 23 additions & 19 deletions torchaudio/csrc/ffmpeg/filter_graph.cpp
@@ -1,12 +1,15 @@
#include <torchaudio/csrc/ffmpeg/filter_graph.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <stdexcept>

namespace torchaudio {
namespace io {

using torchaudio::io::detail::libav;

namespace {
AVFilterGraph* get_filter_graph() {
AVFilterGraph* ptr = avfilter_graph_alloc();
AVFilterGraph* ptr = libav().avfilter_graph_alloc();
TORCH_CHECK(ptr, "Failed to allocate resouce.");
ptr->nb_threads = 1;
return ptr;
@@ -32,7 +35,7 @@ std::string get_audio_src_args(
time_base.num,
time_base.den,
sample_rate,
av_get_sample_fmt_name(format),
libav().av_get_sample_fmt_name(format),
channel_layout);
return std::string(args);
}
@@ -51,7 +54,7 @@ std::string get_video_src_args(
"video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d",
width,
height,
av_get_pix_fmt_name(format),
libav().av_get_pix_fmt_name(format),
time_base.num,
time_base.den,
frame_rate.num,
@@ -69,7 +72,7 @@ void FilterGraph::add_audio_src(
int sample_rate,
uint64_t channel_layout) {
add_src(
avfilter_get_by_name("abuffer"),
libav().avfilter_get_by_name("abuffer"),
get_audio_src_args(format, time_base, sample_rate, channel_layout));
}

@@ -81,13 +84,13 @@ void FilterGraph::add_video_src(
int height,
AVRational sample_aspect_ratio) {
add_src(
avfilter_get_by_name("buffer"),
libav().avfilter_get_by_name("buffer"),
get_video_src_args(
format, time_base, frame_rate, width, height, sample_aspect_ratio));
}

void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
int ret = avfilter_graph_create_filter(
int ret = libav().avfilter_graph_create_filter(
&buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph);
TORCH_CHECK(
ret >= 0,
@@ -96,11 +99,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
}

void FilterGraph::add_audio_sink() {
add_sink(avfilter_get_by_name("abuffersink"));
add_sink(libav().avfilter_get_by_name("abuffersink"));
}

void FilterGraph::add_video_sink() {
add_sink(avfilter_get_by_name("buffersink"));
add_sink(libav().avfilter_get_by_name("buffersink"));
}

void FilterGraph::add_sink(const AVFilter* buffersink) {
@@ -114,7 +117,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) {
// According to the other example
// https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html
// `abuffersink` should not take options, and this resolved issue.
int ret = avfilter_graph_create_filter(
int ret = libav().avfilter_graph_create_filter(
&buffersink_ctx, buffersink, "out", nullptr, nullptr, graph);
TORCH_CHECK(ret >= 0, "Failed to create output filter.");
}
@@ -131,15 +134,15 @@ class InOuts {

public:
InOuts(const char* name, AVFilterContext* pCtx) {
p = avfilter_inout_alloc();
p = libav().avfilter_inout_alloc();
TORCH_CHECK(p, "Failed to allocate AVFilterInOut.");
p->name = av_strdup(name);
p->name = libav().av_strdup(name);
p->filter_ctx = pCtx;
p->pad_idx = 0;
p->next = nullptr;
}
~InOuts() {
avfilter_inout_free(&p);
libav().avfilter_inout_free(&p);
}
operator AVFilterInOut**() {
return &p;
@@ -156,7 +159,7 @@ void FilterGraph::add_process(const std::string& filter_description) {
// If you are debugging this part of the code, you might get confused.
InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};

int ret = avfilter_graph_parse_ptr(
int ret = libav().avfilter_graph_parse_ptr(
graph, filter_description.c_str(), out, in, nullptr);

TORCH_CHECK(
@@ -167,11 +170,11 @@

void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
int ret = avfilter_graph_config(graph, nullptr);
int ret = libav().avfilter_graph_config(graph, nullptr);
TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
// char* desc = avfilter_graph_dump(graph, NULL);
// char* desc = libav().avfilter_graph_dump(graph, NULL);
// std::cerr << "Filter created:\n" << desc << std::endl;
// av_free(static_cast<void*>(desc));
// libav().av_free(static_cast<void*>(desc));
}

//////////////////////////////////////////////////////////////////////////////
@@ -191,7 +194,8 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
ret.num_channels = l->ch_layout.nb_channels;
#else
// Before FFmpeg 5.1
ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
ret.num_channels =
libav().av_get_channel_layout_nb_channels(l->channel_layout);
#endif
break;
}
@@ -214,12 +218,12 @@
// Streaming process
//////////////////////////////////////////////////////////////////////////////
int FilterGraph::add_frame(AVFrame* pInputFrame) {
return av_buffersrc_add_frame_flags(
return libav().av_buffersrc_add_frame_flags(
buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF);
}

int FilterGraph::get_frame(AVFrame* pOutputFrame) {
return av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
return libav().av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
}

} // namespace io
6 changes: 5 additions & 1 deletion torchaudio/csrc/ffmpeg/hw_context.cpp
@@ -1,6 +1,10 @@
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/libav.h>

namespace torchaudio::io {

using detail::libav;

namespace {

static std::mutex MUTEX;
@@ -15,7 +19,7 @@ AVBufferRef* get_cuda_context(int index) {
}
if (CUDA_CONTEXT_CACHE.count(index) == 0) {
AVBufferRef* p = nullptr;
int ret = av_hwdevice_ctx_create(
int ret = libav().av_hwdevice_ctx_create(
&p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0);
TORCH_CHECK(
ret >= 0,