diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt
index 8b69984cee3..f0182b17a17 100644
--- a/torchaudio/csrc/ffmpeg/CMakeLists.txt
+++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt
@@ -2,8 +2,6 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
 find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)
 add_library(ffmpeg INTERFACE)
 target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}")
-target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}")
-
 
 set(
   sources
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
index 66bd222c050..a465e91ab06 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -1,5 +1,6 @@
 #include <c10/util/Exception.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -8,6 +9,8 @@
 namespace torchaudio {
 namespace io {
 
+using detail::libav;
+
 ////////////////////////////////////////////////////////////////////////////////
 // AVDictionary
 ////////////////////////////////////////////////////////////////////////////////
@@ -15,7 +18,7 @@ AVDictionary* get_option_dict(const c10::optional<OptionDict>& option) {
   AVDictionary* opt = nullptr;
   if (option) {
     for (auto const& [key, value] : option.value()) {
-      av_dict_set(&opt, key.c_str(), value.c_str(), 0);
+      libav().av_dict_set(&opt, key.c_str(), value.c_str(), 0);
     }
   }
   return opt;
@@ -26,10 +29,10 @@ void clean_up_dict(AVDictionary* p) {
     std::vector<std::string> unused_keys;
     // Check and copy unused keys, clean up the original dictionary
     AVDictionaryEntry* t = nullptr;
-    while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
+    while ((t = libav().av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
       unused_keys.emplace_back(t->key);
     }
-    av_dict_free(&p);
+    libav().av_dict_free(&p);
     TORCH_CHECK(
         unused_keys.empty(),
         "Unexpected options: ",
@@ -41,14 +44,14 @@ void clean_up_dict(AVDictionary* p) {
 // AVFormatContext
 ////////////////////////////////////////////////////////////////////////////////
 void AVFormatInputContextDeleter::operator()(AVFormatContext* p) {
-  avformat_close_input(&p);
+  libav().avformat_close_input(&p);
 };
 
 AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p)
     : Wrapper<AVFormatContext, AVFormatInputContextDeleter>(p) {}
 
 void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) {
-  avformat_free_context(p);
+  libav().avformat_free_context(p);
 };
 
 AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
@@ -58,9 +61,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
 // AVIO
 ////////////////////////////////////////////////////////////////////////////////
 void AVIOContextDeleter::operator()(AVIOContext* p) {
-  avio_flush(p);
-  av_freep(&p->buffer);
-  av_freep(&p);
+  libav().avio_flush(p);
+  libav().av_freep(&p->buffer);
+  libav().av_freep(&p);
 };
 
 AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
@@ -70,13 +73,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
 // AVPacket
 ////////////////////////////////////////////////////////////////////////////////
 void AVPacketDeleter::operator()(AVPacket* p) {
-  av_packet_free(&p);
+  libav().av_packet_free(&p);
 };
 
 AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper<AVPacket, AVPacketDeleter>(p) {}
 
 AVPacketPtr alloc_avpacket() {
-  AVPacket* p = av_packet_alloc();
+  AVPacket* p = libav().av_packet_alloc();
   TORCH_CHECK(p, "Failed to allocate AVPacket object.");
   return AVPacketPtr{p};
 }
@@ -86,7 +89,7 @@ AVPacketPtr alloc_avpacket() {
 ////////////////////////////////////////////////////////////////////////////////
 AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){};
 AutoPacketUnref::~AutoPacketUnref() {
-  av_packet_unref(p_);
+  libav().av_packet_unref(p_);
 }
 AutoPacketUnref::operator AVPacket*() const {
   return p_;
@@ -96,13 +99,13 @@ AutoPacketUnref::operator AVPacket*() const {
 // AVFrame
 ////////////////////////////////////////////////////////////////////////////////
 void AVFrameDeleter::operator()(AVFrame* p) {
-  av_frame_free(&p);
+  libav().av_frame_free(&p);
 };
 
 AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper<AVFrame, AVFrameDeleter>(p) {}
 
 AVFramePtr alloc_avframe() {
-  AVFrame* p = av_frame_alloc();
+  AVFrame* p = libav().av_frame_alloc();
   TORCH_CHECK(p, "Failed to allocate AVFrame object.");
   return AVFramePtr{p};
 };
@@ -111,7 +114,7 @@ AVFramePtr alloc_avframe() {
 // AVCodecContext
 ////////////////////////////////////////////////////////////////////////////////
 void AVCodecContextDeleter::operator()(AVCodecContext* p) {
-  avcodec_free_context(&p);
+  libav().avcodec_free_context(&p);
 };
 
 AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
@@ -121,7 +124,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
 // AVBufferRefPtr
 ////////////////////////////////////////////////////////////////////////////////
 void AutoBufferUnref::operator()(AVBufferRef* p) {
-  av_buffer_unref(&p);
+  libav().av_buffer_unref(&p);
 }
 
 AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
@@ -131,7 +134,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
 // AVFilterGraph
 ////////////////////////////////////////////////////////////////////////////////
 void AVFilterGraphDeleter::operator()(AVFilterGraph* p) {
-  avfilter_graph_free(&p);
+  libav().avfilter_graph_free(&p);
 };
 
 AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
@@ -141,7 +144,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
 // AVCodecParameters
 ////////////////////////////////////////////////////////////////////////////////
 void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) {
-  avcodec_parameters_free(&codecpar);
+  libav().avcodec_parameters_free(&codecpar);
 }
 
 AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p)
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
index 0bae00c12d8..9c40d7fe117 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.h
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -6,6 +6,9 @@
 #include <memory>
 #include <string>
 
+#include <torchaudio/csrc/ffmpeg/macro.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
+
 extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavdevice/avdevice.h>
@@ -29,21 +32,13 @@ namespace io {
 
 using OptionDict = std::map<std::string, std::string>;
 
-// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
-// Starting from libavformat 59 (ffmpeg 5),
-// AVInputFormat is const and related functions expect constant.
-#if LIBAVFORMAT_VERSION_MAJOR >= 59
-#define AVFORMAT_CONST const
-#else
-#define AVFORMAT_CONST
-#endif
-
 // Replacement of av_err2str, which causes
 // `error: taking address of temporary array`
 // https://github.com/joncampbell123/composite-video-simulator/issues/5
 av_always_inline std::string av_err2string(int errnum) {
   char str[AV_ERROR_MAX_STRING_SIZE];
-  return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum);
+  detail::libav().av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE);
+  return str;
 }
 
 // Base structure that handles memory management.
diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp
index 797f0783494..ca4c92bb5c7 100644
--- a/torchaudio/csrc/ffmpeg/filter_graph.cpp
+++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp
@@ -1,12 +1,15 @@
 #include <torchaudio/csrc/ffmpeg/filter_graph.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <stdexcept>
 
 namespace torchaudio {
 namespace io {
 
+using detail::libav;
+
 namespace {
 AVFilterGraph* get_filter_graph() {
-  AVFilterGraph* ptr = avfilter_graph_alloc();
+  AVFilterGraph* ptr = libav().avfilter_graph_alloc();
   TORCH_CHECK(ptr, "Failed to allocate resouce.");
   ptr->nb_threads = 1;
   return ptr;
@@ -32,7 +35,7 @@ std::string get_audio_src_args(
       time_base.num,
       time_base.den,
       sample_rate,
-      av_get_sample_fmt_name(format),
+      libav().av_get_sample_fmt_name(format),
       channel_layout);
   return std::string(args);
 }
@@ -51,7 +54,7 @@ std::string get_video_src_args(
       "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d",
       width,
       height,
-      av_get_pix_fmt_name(format),
+      libav().av_get_pix_fmt_name(format),
       time_base.num,
       time_base.den,
       frame_rate.num,
@@ -69,7 +72,7 @@ void FilterGraph::add_audio_src(
     int sample_rate,
     uint64_t channel_layout) {
   add_src(
-      avfilter_get_by_name("abuffer"),
+      libav().avfilter_get_by_name("abuffer"),
       get_audio_src_args(format, time_base, sample_rate, channel_layout));
 }
 
@@ -81,13 +84,13 @@ void FilterGraph::add_video_src(
     int height,
     AVRational sample_aspect_ratio) {
   add_src(
-      avfilter_get_by_name("buffer"),
+      libav().avfilter_get_by_name("buffer"),
       get_video_src_args(
           format, time_base, frame_rate, width, height, sample_aspect_ratio));
 }
 
 void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
-  int ret = avfilter_graph_create_filter(
+  int ret = libav().avfilter_graph_create_filter(
       &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph);
   TORCH_CHECK(
       ret >= 0,
@@ -96,11 +99,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
 }
 
 void FilterGraph::add_audio_sink() {
-  add_sink(avfilter_get_by_name("abuffersink"));
+  add_sink(libav().avfilter_get_by_name("abuffersink"));
 }
 
 void FilterGraph::add_video_sink() {
-  add_sink(avfilter_get_by_name("buffersink"));
+  add_sink(libav().avfilter_get_by_name("buffersink"));
 }
 
 void FilterGraph::add_sink(const AVFilter* buffersink) {
@@ -114,7 +117,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) {
   // According to the other example
   // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html
   // `abuffersink` should not take options, and this resolved issue.
-  int ret = avfilter_graph_create_filter(
+  int ret = libav().avfilter_graph_create_filter(
       &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph);
   TORCH_CHECK(ret >= 0, "Failed to create output filter.");
 }
@@ -131,15 +134,15 @@ class InOuts {
 
  public:
   InOuts(const char* name, AVFilterContext* pCtx) {
-    p = avfilter_inout_alloc();
+    p = libav().avfilter_inout_alloc();
     TORCH_CHECK(p, "Failed to allocate AVFilterInOut.");
-    p->name = av_strdup(name);
+    p->name = libav().av_strdup(name);
     p->filter_ctx = pCtx;
     p->pad_idx = 0;
     p->next = nullptr;
   }
   ~InOuts() {
-    avfilter_inout_free(&p);
+    libav().avfilter_inout_free(&p);
   }
   operator AVFilterInOut**() {
     return &p;
@@ -156,7 +159,7 @@ void FilterGraph::add_process(const std::string& filter_description) {
   // If you are debugging this part of the code, you might get confused.
   InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};
 
-  int ret = avfilter_graph_parse_ptr(
+  int ret = libav().avfilter_graph_parse_ptr(
       graph, filter_description.c_str(), out, in, nullptr);
 
   TORCH_CHECK(
@@ -167,11 +170,11 @@ void FilterGraph::add_process(const std::string& filter_description) {
 
 void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
   buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
-  int ret = avfilter_graph_config(graph, nullptr);
+  int ret = libav().avfilter_graph_config(graph, nullptr);
   TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
-  // char* desc = avfilter_graph_dump(graph, NULL);
+  // char* desc = libav().avfilter_graph_dump(graph, NULL);
   // std::cerr << "Filter created:\n" << desc << std::endl;
-  // av_free(static_cast<void*>(desc));
+  // libav().av_free(static_cast<void*>(desc));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -191,7 +194,7 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
       ret.num_channels = l->ch_layout.nb_channels;
 #else
       // Before FFmpeg 5.1
-      ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
+      ret.num_channels = libav().av_get_channel_layout_nb_channels(l->channel_layout);
 #endif
       break;
     }
@@ -214,12 +217,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
 // Streaming process
 //////////////////////////////////////////////////////////////////////////////
 int FilterGraph::add_frame(AVFrame* pInputFrame) {
-  return av_buffersrc_add_frame_flags(
+  return libav().av_buffersrc_add_frame_flags(
       buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF);
 }
 
 int FilterGraph::get_frame(AVFrame* pOutputFrame) {
-  return av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
+  return libav().av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
 }
 
 } // namespace io
diff --git a/torchaudio/csrc/ffmpeg/hw_context.cpp b/torchaudio/csrc/ffmpeg/hw_context.cpp
index a1d7f3c7a04..7341b8e0746 100644
--- a/torchaudio/csrc/ffmpeg/hw_context.cpp
+++ b/torchaudio/csrc/ffmpeg/hw_context.cpp
@@ -1,6 +1,10 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 
 namespace torchaudio::io {
+
+using detail::libav;
+
 namespace {
 
 static std::mutex MUTEX;
@@ -15,7 +19,7 @@ AVBufferRef* get_cuda_context(int index) {
   }
   if (CUDA_CONTEXT_CACHE.count(index) == 0) {
     AVBufferRef* p = nullptr;
-    int ret = av_hwdevice_ctx_create(
+    int ret = libav().av_hwdevice_ctx_create(
         &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0);
     TORCH_CHECK(
         ret >= 0,
diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
index 7ccc7bd0bf9..3470d3b3ff6 100644
--- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
+++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -1,10 +1,14 @@
 #include <torch/extension.h>
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
 
 namespace torchaudio {
 namespace io {
+
+using detail::libav;
+
 namespace {
 
 std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
@@ -12,7 +16,7 @@ std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
 
 #define add_version(NAME)            \
   {                                  \
-    int ver = NAME##_version();      \
+    int ver = libav().NAME##_version(); \
     ret.emplace(                     \
         "lib" #NAME,                 \
         std::make_tuple<>(           \
@@ -35,7 +39,7 @@ std::map<std::string, std::string> get_demuxers(bool req_device) {
   std::map<std::string, std::string> ret;
   const AVInputFormat* fmt = nullptr;
   void* i = nullptr;
-  while ((fmt = av_demuxer_iterate(&i))) {
+  while ((fmt = libav().av_demuxer_iterate(&i))) {
     assert(fmt);
     bool is_device = [&]() {
       const AVClass* avclass = fmt->priv_class;
@@ -52,7 +56,7 @@ std::map<std::string, std::string> get_muxers(bool req_device) {
   std::map<std::string, std::string> ret;
   const AVOutputFormat* fmt = nullptr;
   void* i = nullptr;
-  while ((fmt = av_muxer_iterate(&i))) {
+  while ((fmt = libav().av_muxer_iterate(&i))) {
     assert(fmt);
     bool is_device = [&]() {
       const AVClass* avclass = fmt->priv_class;
@@ -71,10 +75,10 @@ std::map<std::string, std::string> get_codecs(
   const AVCodec* c = nullptr;
   void* i = nullptr;
   std::map<std::string, std::string> ret;
-  while ((c = av_codec_iterate(&i))) {
+  while ((c = libav().av_codec_iterate(&i))) {
     assert(c);
-    if ((req_encoder && av_codec_is_encoder(c)) ||
-        (!req_encoder && av_codec_is_decoder(c))) {
+    if ((req_encoder && libav().av_codec_is_encoder(c)) ||
+        (!req_encoder && libav().av_codec_is_decoder(c))) {
       if (c->type == type && c->name) {
         ret.emplace(c->name, c->long_name ? c->long_name : "");
       }
@@ -87,7 +91,7 @@ std::vector<std::string> get_protocols(bool output) {
   void* opaque = nullptr;
   const char* name = nullptr;
   std::vector<std::string> ret;
-  while ((name = avio_enum_protocols(&opaque, output))) {
+  while ((name = libav().avio_enum_protocols(&opaque, output))) {
     assert(name);
     ret.emplace_back(name);
   }
@@ -95,7 +99,7 @@ std::vector<std::string> get_protocols(bool output) {
 }
 
 std::string get_build_config() {
-  return avcodec_configuration();
+  return libav().avcodec_configuration();
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -188,9 +192,10 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO {
 };
 
 PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
-  m.def("init", []() { avdevice_register_all(); });
-  m.def("get_log_level", []() { return av_log_get_level(); });
-  m.def("set_log_level", [](int level) { av_log_set_level(level); });
+  m.def("test_dlopen", []() { libav(); });
+  m.def("init", []() { libav().avdevice_register_all(); });
+  m.def("get_log_level", []() { return libav().av_log_get_level(); });
+  m.def("set_log_level", [](int level) { libav().av_log_set_level(level); });
   m.def("get_versions", &get_versions);
   m.def("get_muxers", []() { return get_muxers(false); });
   m.def("get_demuxers", []() { return get_demuxers(false); });
@@ -246,21 +251,21 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def_property_readonly(
           "media_type",
           [](const OutputStreamInfo& o) -> std::string {
-            return av_get_media_type_string(o.media_type);
+            return libav().av_get_media_type_string(o.media_type);
           })
       .def_property_readonly(
           "format",
           [](const OutputStreamInfo& o) -> std::string {
             switch (o.media_type) {
               case AVMEDIA_TYPE_AUDIO:
-                return av_get_sample_fmt_name((AVSampleFormat)(o.format));
+                return libav().av_get_sample_fmt_name((AVSampleFormat)(o.format));
               case AVMEDIA_TYPE_VIDEO:
-                return av_get_pix_fmt_name((AVPixelFormat)(o.format));
+                return libav().av_get_pix_fmt_name((AVPixelFormat)(o.format));
               default:
                 TORCH_INTERNAL_ASSERT(
                     false,
                     "FilterGraph is returning unexpected media type: ",
-                    av_get_media_type_string(o.media_type));
+                    libav().av_get_media_type_string(o.media_type));
             }
           })
       .def_readonly("sample_rate", &OutputStreamInfo::sample_rate)
@@ -284,7 +289,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def_property_readonly(
           "media_type",
           [](const SrcStreamInfo& s) {
-            return av_get_media_type_string(s.media_type);
+            return libav().av_get_media_type_string(s.media_type);
           })
       .def_readonly("codec_name", &SrcStreamInfo::codec_name)
       .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name)
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
index 99e33e8367f..469cb315bd9 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
@@ -1,4 +1,5 @@
 #include <torch/torch.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
 
 #ifdef USE_CUDA
@@ -7,6 +8,8 @@
 
 namespace torchaudio::io {
 
+using detail::libav;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Audio
 ////////////////////////////////////////////////////////////////////////////////
@@ -429,11 +432,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      av_get_pix_fmt_name(fmt));
+      libav().av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_NV12 == sw_fmt,
       "Expected NV12 format. Found: ",
-      av_get_pix_fmt_name(sw_fmt));
+      libav().av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   auto status = cudaMemcpy2D(
@@ -506,11 +509,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      av_get_pix_fmt_name(fmt));
+      libav().av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_P010 == sw_fmt,
       "Expected P010 format. Found: ",
-      av_get_pix_fmt_name(sw_fmt));
+      libav().av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   auto status = cudaMemcpy2D(
@@ -581,11 +584,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      av_get_pix_fmt_name(fmt));
+      libav().av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_YUV444P == sw_fmt,
       "Expected YUV444P format. Found: ",
-      av_get_pix_fmt_name(sw_fmt));
+      libav().av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   for (int i = 0; i < num_channels; ++i) {
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
index 8caec7cb582..77944c67fb7 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
@@ -1,10 +1,14 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.h>
 
 namespace torchaudio {
 namespace io {
+
+using detail::libav;
+
 void PacketBuffer::push_packet(AVPacket* packet) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null.");
-  AVPacket* p = av_packet_clone(packet);
+  AVPacket* p = libav().av_packet_clone(packet);
   TORCH_INTERNAL_ASSERT(p, "Failed to clone packet.");
   packets.emplace_back(p);
 }
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
index 147d0bc2d57..2cd83746889 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
@@ -1,3 +1,4 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/buffer/chunked_buffer.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
@@ -5,6 +6,9 @@
 
 namespace torchaudio::io {
 namespace detail {
+
+// detail::libav() is already visible here: this is namespace detail.
+
 namespace {
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -48,7 +52,7 @@ FilterGraphFactory get_video_factory(
     f.add_video_sink();
     f.add_process(filter_desc);
     if (hw_frames_ctx) {
-      f.create_filter(av_buffer_ref(hw_frames_ctx));
+      f.create_filter(libav().av_buffer_ref(hw_frames_ctx));
     } else {
       f.create_filter();
     }
@@ -139,7 +143,7 @@ struct ProcessImpl : public IPostDecodeProcess {
       if (ret >= 0) {
         buffer.push_frame(converter.convert(frame), frame->pts);
       }
-      av_frame_unref(frame);
+      libav().av_frame_unref(frame);
     }
     return ret;
   }
@@ -159,7 +163,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
   TORCH_INTERNAL_ASSERT(
       i.type == AVMEDIA_TYPE_AUDIO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   using B = UnchunkedBuffer;
 
@@ -226,7 +230,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
     }
     default:
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
+          false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt));
   }
 }
 
@@ -239,7 +243,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_AUDIO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   using B = ChunkedBuffer;
   B buffer{i.time_base, frames_per_chunk, num_chunks};
@@ -307,7 +311,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
     }
     default:
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
+          false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt));
   }
 }
 
@@ -321,7 +325,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   auto h = i.height;
   auto w = i.width;
@@ -375,7 +379,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
     }
     default: {
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
+          false, "Unexpected video format found: ", libav().av_get_pix_fmt_name(fmt));
     }
   }
 }
@@ -393,7 +397,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   using B = UnchunkedBuffer;
   switch (auto fmt = (AVPixelFormat)i.format; fmt) {
@@ -416,13 +420,13 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
       TORCH_CHECK(
           false,
           "Unsupported video format found in CUDA HW: ",
-          av_get_pix_fmt_name(fmt));
+          libav().av_get_pix_fmt_name(fmt));
     }
     default: {
       TORCH_CHECK(
           false,
           "Unexpected video format found in CUDA HW: ",
-          av_get_pix_fmt_name(fmt));
+          libav().av_get_pix_fmt_name(fmt));
     }
   }
 #endif
@@ -437,7 +441,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   auto h = i.height;
   auto w = i.width;
@@ -491,7 +495,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
     }
     default: {
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
+          false, "Unexpected video format found: ", libav().av_get_pix_fmt_name(fmt));
     }
   }
 }
@@ -511,7 +515,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      av_get_media_type_string(i.type));
+      libav().av_get_media_type_string(i.type));
 
   using B = ChunkedBuffer;
   switch (auto fmt = (AVPixelFormat)i.format; fmt) {
@@ -540,13 +544,13 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
       TORCH_CHECK(
           false,
           "Unsupported video format found in CUDA HW: ",
-          av_get_pix_fmt_name(fmt));
+          libav().av_get_pix_fmt_name(fmt));
     }
     default: {
       TORCH_CHECK(
           false,
           "Unexpected video format found in CUDA HW: ",
-          av_get_pix_fmt_name(fmt));
+          libav().av_get_pix_fmt_name(fmt));
     }
   }
 #endif
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
index a0bf22a0650..272975553d1 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
@@ -1,3 +1,4 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>
 #include <stdexcept>
@@ -6,6 +7,8 @@
 namespace torchaudio {
 namespace io {
 
+using detail::libav;
+
 namespace {
 AVCodecContextPtr alloc_codec_context(
     enum AVCodecID codec_id,
@@ -13,24 +16,24 @@ AVCodecContextPtr alloc_codec_context(
   const AVCodec* codec = [&]() {
     if (decoder_name) {
       const AVCodec* c =
-          avcodec_find_decoder_by_name(decoder_name.value().c_str());
+          libav().avcodec_find_decoder_by_name(decoder_name.value().c_str());
       TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value());
       return c;
     } else {
-      const AVCodec* c = avcodec_find_decoder(codec_id);
-      TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id));
+      const AVCodec* c = libav().avcodec_find_decoder(codec_id);
+      TORCH_CHECK(c, "Unsupported codec: ", libav().avcodec_get_name(codec_id));
       return c;
     }
   }();
 
-  AVCodecContext* codec_ctx = avcodec_alloc_context3(codec);
+  AVCodecContext* codec_ctx = libav().avcodec_alloc_context3(codec);
   TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext.");
   return AVCodecContextPtr(codec_ctx);
 }
 
 const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) {
   for (int i = 0;; ++i) {
-    const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i);
+    const AVCodecHWConfig* config = libav().avcodec_get_hw_config(codec, i);
     if (!config) {
       break;
     }
@@ -83,7 +86,7 @@ enum AVPixelFormat get_hw_format(
 }
 
 AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
-  AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
+  AVBufferRef* p = libav().av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
   TORCH_CHECK(
       p,
       "Failed to allocate CUDA frame context from device context at ",
@@ -94,11 +97,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
   frames_ctx->width = codec_ctx->width;
   frames_ctx->height = codec_ctx->height;
   frames_ctx->initial_pool_size = 5;
-  int ret = av_hwframe_ctx_init(p);
+  int ret = libav().av_hwframe_ctx_init(p);
   if (ret >= 0) {
     return p;
   }
-  av_buffer_unref(&p);
+  libav().av_buffer_unref(&p);
   TORCH_CHECK(
       false, "Failed to initialize CUDA frame context: ", av_err2string(ret));
 }
@@ -107,7 +110,7 @@ void configure_codec_context(
     AVCodecContext* codec_ctx,
     const AVCodecParameters* params,
     const torch::Device& device) {
-  int ret = avcodec_parameters_to_context(codec_ctx, params);
+  int ret = libav().avcodec_parameters_to_context(codec_ctx, params);
   TORCH_CHECK(
       ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret));
 
@@ -122,7 +125,7 @@ void configure_codec_context(
     // 2. Set pCodecContext->get_format call back function which
     // will retrieve the HW pixel format from opaque pointer.
     codec_ctx->get_format = get_hw_format;
-    codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index()));
+    codec_ctx->hw_device_ctx = libav().av_buffer_ref(get_cuda_context(device.index()));
     TORCH_INTERNAL_ASSERT(
         codec_ctx->hw_device_ctx, "Failed to reference HW device context.");
 #endif
@@ -135,16 +138,16 @@ void open_codec(
   AVDictionary* opts = get_option_dict(decoder_option);
 
   // Default to single thread execution.
-  if (!av_dict_get(opts, "threads", nullptr, 0)) {
-    av_dict_set(&opts, "threads", "1", 0);
+  if (!libav().av_dict_get(opts, "threads", nullptr, 0)) {
+    libav().av_dict_set(&opts, "threads", "1", 0);
   }
 
   if (!codec_ctx->channel_layout) {
     codec_ctx->channel_layout =
-        av_get_default_channel_layout(codec_ctx->channels);
+        libav().av_get_default_channel_layout(codec_ctx->channels);
   }
 
-  int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
+  int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
   clean_up_dict(opts);
   TORCH_CHECK(
       ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret));
@@ -260,7 +263,7 @@ void StreamProcessor::remove_stream(KeyType key) {
 void StreamProcessor::set_discard_timestamp(int64_t timestamp) {
   TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative.");
   discard_before_pts =
-      av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base);
+      libav().av_rescale_q(timestamp, libav().av_get_time_base_q(), stream_time_base);
 }
 
 void StreamProcessor::set_decoder(
@@ -306,9 +309,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       is_decoder_set(),
       "Decoder must have been set prior to calling this function.");
-  int ret = avcodec_send_packet(codec_ctx, packet);
+  int ret = libav().avcodec_send_packet(codec_ctx, packet);
   while (ret >= 0) {
-    ret = avcodec_receive_frame(codec_ctx, frame);
+    ret = libav().avcodec_receive_frame(codec_ctx, frame);
     //  AVERROR(EAGAIN) means that new input data is required to return new
     //  output.
     if (ret == AVERROR(EAGAIN))
@@ -355,7 +358,7 @@ int StreamProcessor::process_packet(AVPacket* packet) {
     }
 
     // else we can just unref the frame and continue
-    av_frame_unref(frame);
+    libav().av_frame_unref(frame);
   }
   return ret;
 }
@@ -364,7 +367,7 @@ void StreamProcessor::flush() {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       is_decoder_set(),
       "Decoder must have been set prior to calling this function.");
-  avcodec_flush_buffers(codec_ctx);
+  libav().avcodec_flush_buffers(codec_ctx);
   for (auto& ite : post_processes) {
     ite.second->flush();
   }
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
index 0eec327aa51..5a2143e2730 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
@@ -1,13 +1,19 @@
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
 #include <chrono>
 #include <sstream>
 #include <stdexcept>
 #include <thread>
 
+extern "C" {
+#include <libavutil/rational.h>
+}
+
 namespace torchaudio {
 namespace io {
 
+using detail::libav;
 using KeyType = StreamProcessor::KeyType;
 
 //////////////////////////////////////////////////////////////////////////////
@@ -19,7 +25,7 @@ AVFormatContext* get_input_format_context(
     const c10::optional<std::string>& format,
     const c10::optional<OptionDict>& option,
     AVIOContext* io_ctx) {
-  AVFormatContext* p = avformat_alloc_context();
+  AVFormatContext* p = libav().avformat_alloc_context();
   TORCH_CHECK(p, "Failed to allocate AVFormatContext.");
   if (io_ctx) {
     p->pb = io_ctx;
@@ -29,7 +35,7 @@ AVFormatContext* get_input_format_context(
     if (format.has_value()) {
       std::string format_str = format.value();
       AVFORMAT_CONST AVInputFormat* pInput =
-          av_find_input_format(format_str.c_str());
+          libav().av_find_input_format(format_str.c_str());
       TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\"");
       return pInput;
     }
@@ -37,7 +43,7 @@ AVFormatContext* get_input_format_context(
   }();
 
   AVDictionary* opt = get_option_dict(option);
-  int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
+  int ret = libav().avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
   clean_up_dict(opt);
 
   TORCH_CHECK(
@@ -53,7 +59,7 @@ AVFormatContext* get_input_format_context(
 
 StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) {
   C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader");
-  int ret = avformat_find_stream_info(format_ctx, nullptr);
+  int ret = libav().avformat_find_stream_info(format_ctx, nullptr);
   TORCH_CHECK(
       ret >= 0, "Failed to find stream information: ", av_err2string(ret));
 
@@ -110,7 +116,7 @@ void validate_src_stream_type(
       "Stream ",
       i,
       " is not ",
-      av_get_media_type_string(type),
+      libav().av_get_media_type_string(type),
       " stream.");
 }
 
@@ -125,7 +131,7 @@ namespace {
 OptionDict parse_metadata(const AVDictionary* metadata) {
   AVDictionaryEntry* tag = nullptr;
   OptionDict ret;
-  while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
+  while ((tag = libav().av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
     ret.emplace(std::string(tag->key), std::string(tag->value));
   }
   return ret;
@@ -148,7 +154,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
   ret.num_frames = stream->nb_frames;
   ret.bits_per_sample = codecpar->bits_per_raw_sample;
   ret.metadata = parse_metadata(stream->metadata);
-  const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
+  const AVCodecDescriptor* desc = libav().avcodec_descriptor_get(codecpar->codec_id);
   if (desc) {
     ret.codec_name = desc->name;
     ret.codec_long_name = desc->long_name;
@@ -158,7 +164,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
     case AVMEDIA_TYPE_AUDIO: {
       AVSampleFormat smp_fmt = static_cast<AVSampleFormat>(codecpar->format);
       if (smp_fmt != AV_SAMPLE_FMT_NONE) {
-        ret.fmt_name = av_get_sample_fmt_name(smp_fmt);
+        ret.fmt_name = libav().av_get_sample_fmt_name(smp_fmt);
       }
       ret.sample_rate = static_cast<double>(codecpar->sample_rate);
       ret.num_channels = codecpar->channels;
@@ -167,7 +173,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
     case AVMEDIA_TYPE_VIDEO: {
       AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(codecpar->format);
       if (pix_fmt != AV_PIX_FMT_NONE) {
-        ret.fmt_name = av_get_pix_fmt_name(pix_fmt);
+        ret.fmt_name = libav().av_get_pix_fmt_name(pix_fmt);
       }
       ret.width = codecpar->width;
       ret.height = codecpar->height;
@@ -181,7 +187,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
 
 namespace {
 AVCodecParameters* get_codecpar() {
-  AVCodecParameters* ptr = avcodec_parameters_alloc();
+  AVCodecParameters* ptr = libav().avcodec_parameters_alloc();
   TORCH_CHECK(ptr, "Failed to allocate resource.");
   return ptr;
 }
@@ -192,7 +198,7 @@ StreamParams StreamReader::get_src_stream_params(int i) {
   AVStream* stream = format_ctx->streams[i];
 
   AVCodecParametersPtr codec_params(get_codecpar());
-  int ret = avcodec_parameters_copy(codec_params, stream->codecpar);
+  int ret = libav().avcodec_parameters_copy(codec_params, stream->codecpar);
   TORCH_CHECK(
       ret >= 0,
       "Failed to copy the stream's codec parameters. (",
@@ -234,12 +240,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const {
 }
 
 int64_t StreamReader::find_best_audio_stream() const {
-  return av_find_best_stream(
+  return libav().av_find_best_stream(
       format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0);
 }
 
 int64_t StreamReader::find_best_video_stream() const {
-  return av_find_best_stream(
+  return libav().av_find_best_stream(
       format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
 }
 
@@ -289,7 +295,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) {
       TORCH_CHECK(false, "Invalid mode value: ", mode);
   }
 
-  int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
+  int ret = libav().av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
 
   if (ret < 0) {
     seek_timestamp = 0;
@@ -402,12 +408,12 @@ void StreamReader::add_stream(
       case AVMEDIA_TYPE_AUDIO:
         return AVRational{0, 1};
       case AVMEDIA_TYPE_VIDEO:
-        return av_guess_frame_rate(format_ctx, stream, nullptr);
+        return libav().av_guess_frame_rate(format_ctx, stream, nullptr);
       default:
         TORCH_INTERNAL_ASSERT(
             false,
             "Unexpected media type is given: ",
-            av_get_media_type_string(media_type));
+            libav().av_get_media_type_string(media_type));
     }
   }();
   int key = processors[i]->add_stream(
@@ -446,7 +452,7 @@ void StreamReader::remove_stream(int64_t i) {
 // 1: It's done, caller should stop calling
 // <0: Some error happened
 int StreamReader::process_packet() {
-  int ret = av_read_frame(format_ctx, packet);
+  int ret = libav().av_read_frame(format_ctx, packet);
   if (ret == AVERROR_EOF) {
     ret = drain();
     return (ret < 0) ? ret : 1;
@@ -577,12 +583,12 @@ AVIOContext* get_io_context(
     int buffer_size,
     int (*read_packet)(void* opaque, uint8_t* buf, int buf_size),
     int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
-  unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
+  unsigned char* buffer = static_cast<unsigned char*>(libav().av_malloc(buffer_size));
   TORCH_CHECK(buffer, "Failed to allocate buffer.");
-  AVIOContext* io_ctx = avio_alloc_context(
+  AVIOContext* io_ctx = libav().avio_alloc_context(
       buffer, buffer_size, 0, opaque, read_packet, nullptr, seek);
   if (!io_ctx) {
-    av_freep(&buffer);
+    libav().av_freep(&buffer);
     TORCH_CHECK(false, "Failed to allocate AVIOContext.");
   }
   return io_ctx;
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
index a0e18fb8d41..7269dfc4fc6 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
@@ -1,9 +1,16 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/encode_process.h>
 #include <cmath>
 
+extern "C" {
+#include <libavutil/rational.h>
+}
+
 namespace torchaudio::io {
 
+using detail::libav;
+
 ////////////////////////////////////////////////////////////////////////////////
 // EncodeProcess Logic Implementation
 ////////////////////////////////////////////////////////////////////////////////
@@ -56,7 +63,7 @@ void EncodeProcess::process_frame(AVFrame* src) {
     if (ret >= 0) {
       encoder.encode(dst_frame);
     }
-    av_frame_unref(dst_frame);
+    libav().av_frame_unref(dst_frame);
   }
 }
 
@@ -71,8 +78,8 @@ void EncodeProcess::flush() {
 namespace {
 
 enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
-  auto fmt = av_get_sample_fmt(src.c_str());
-  if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) {
+  auto fmt = libav().av_get_sample_fmt(src.c_str());
+  if (fmt != AV_SAMPLE_FMT_NONE && !libav().av_sample_fmt_is_planar(fmt)) {
     return fmt;
   }
   TORCH_CHECK(
@@ -89,7 +96,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
               AV_SAMPLE_FMT_S64,
               AV_SAMPLE_FMT_FLT,
               AV_SAMPLE_FMT_DBL}) {
-          ret.emplace_back(av_get_sample_fmt_name(fmt));
+          ret.emplace_back(libav().av_get_sample_fmt_name(fmt));
         }
         return c10::Join(", ", ret);
       }(),
@@ -97,7 +104,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
 }
 
 enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
-  AVPixelFormat fmt = av_get_pix_fmt(src.c_str());
+  AVPixelFormat fmt = libav().av_get_pix_fmt(src.c_str());
   switch (fmt) {
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_RGB24:
@@ -118,7 +125,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
               AV_PIX_FMT_RGB24,
               AV_PIX_FMT_BGR24,
               AV_PIX_FMT_YUV444P}) {
-          ret.emplace_back(av_get_pix_fmt_name(fmt));
+          ret.emplace_back(libav().av_get_pix_fmt_name(fmt));
         }
         return c10::Join(", ", ret);
       }(),
@@ -132,18 +139,18 @@ const AVCodec* get_codec(
     AVCodecID default_codec,
     const c10::optional<std::string>& encoder) {
   if (encoder) {
-    const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str());
+    const AVCodec* c = libav().avcodec_find_encoder_by_name(encoder.value().c_str());
     TORCH_CHECK(c, "Unexpected codec: ", encoder.value());
     return c;
   }
-  const AVCodec* c = avcodec_find_encoder(default_codec);
+  const AVCodec* c = libav().avcodec_find_encoder(default_codec);
   TORCH_CHECK(
-      c, "Encoder not found for codec: ", avcodec_get_name(default_codec));
+      c, "Encoder not found for codec: ", libav().avcodec_get_name(default_codec));
   return c;
 }
 
 AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) {
-  AVCodecContext* ctx = avcodec_alloc_context3(codec);
+  AVCodecContext* ctx = libav().avcodec_alloc_context3(codec);
   TORCH_CHECK(ctx, "Failed to allocate CodecContext.");
 
   if (flags & AVFMT_GLOBALHEADER) {
@@ -169,25 +176,25 @@ void open_codec(
   // while "libopus" refers to the one depends on libopusenc
   // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251
   if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) {
-    if (!av_dict_get(opt, "strict", nullptr, 0)) {
+    if (!libav().av_dict_get(opt, "strict", nullptr, 0)) {
       TORCH_WARN_ONCE(
           "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ",
           "If this is not desired, please provide \"strict\" encoder option ",
           "with desired value.");
-      av_dict_set(&opt, "strict", "experimental", 0);
+      libav().av_dict_set(&opt, "strict", "experimental", 0);
     }
   }
   if (std::strcmp(codec_ctx->codec->name, "opus") == 0) {
-    if (!av_dict_get(opt, "strict", nullptr, 0)) {
+    if (!libav().av_dict_get(opt, "strict", nullptr, 0)) {
       TORCH_WARN_ONCE(
           "\"opus\" encoder is selected. Enabling '-strict experimental'. ",
           "If this is not desired, please provide \"strict\" encoder option ",
           "with desired value.");
-      av_dict_set(&opt, "strict", "experimental", 0);
+      libav().av_dict_set(&opt, "strict", "experimental", 0);
     }
   }
 
-  int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
+  int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
   clean_up_dict(opt);
   TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")");
 }
@@ -214,7 +221,7 @@ bool supported_sample_fmt(
 std::string get_supported_formats(const AVSampleFormat* sample_fmts) {
   std::vector<std::string> ret;
   while (*sample_fmts != AV_SAMPLE_FMT_NONE) {
-    ret.emplace_back(av_get_sample_fmt_name(*sample_fmts));
+    ret.emplace_back(libav().av_get_sample_fmt_name(*sample_fmts));
     ++sample_fmts;
   }
   return c10::Join(", ", ret);
@@ -226,7 +233,7 @@ AVSampleFormat get_enc_fmt(
     const AVCodec* codec) {
   if (encoder_format) {
     auto& enc_fmt_val = encoder_format.value();
-    auto fmt = av_get_sample_fmt(enc_fmt_val.c_str());
+    auto fmt = libav().av_get_sample_fmt(enc_fmt_val.c_str());
     TORCH_CHECK(
         fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val);
     TORCH_CHECK(
@@ -313,8 +320,8 @@ std::string get_supported_channels(const uint64_t* channel_layouts) {
   std::vector<std::string> names;
   while (*channel_layouts) {
     std::stringstream ss;
-    ss << av_get_channel_layout_nb_channels(*channel_layouts);
-    ss << " (" << av_get_channel_name(*channel_layouts) << ")";
+    ss << libav().av_get_channel_layout_nb_channels(*channel_layouts);
+    ss << " (" << libav().av_get_channel_name(*channel_layouts) << ")";
     names.emplace_back(ss.str());
     ++channel_layouts;
   }
@@ -331,10 +338,10 @@ uint64_t get_channel_layout(
     TORCH_CHECK(
         val > 0, "The number of channels must be greater than 0. Found: ", val);
     if (!codec->channel_layouts) {
-      return static_cast<uint64_t>(av_get_default_channel_layout(val));
+      return static_cast<uint64_t>(libav().av_get_default_channel_layout(val));
     }
     for (const uint64_t* it = codec->channel_layouts; *it; ++it) {
-      if (av_get_channel_layout_nb_channels(*it) == val) {
+      if (libav().av_get_channel_layout_nb_channels(*it) == val) {
         return *it;
       }
     }
@@ -371,8 +378,8 @@ void configure_audio_codec_ctx(
     const c10::optional<CodecConfig>& codec_config) {
   codec_ctx->sample_fmt = format;
   codec_ctx->sample_rate = sample_rate;
-  codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24));
-  codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout);
+  codec_ctx->time_base = av_inv_q(libav().av_d2q(sample_rate, 1 << 24));
+  codec_ctx->channels = libav().av_get_channel_layout_nb_channels(channel_layout);
   codec_ctx->channel_layout = channel_layout;
 
   // Set optional stuff
@@ -411,7 +418,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) {
 std::string get_supported_formats(const AVPixelFormat* pix_fmts) {
   std::vector<std::string> ret;
   while (*pix_fmts != AV_PIX_FMT_NONE) {
-    ret.emplace_back(av_get_pix_fmt_name(*pix_fmts));
+    ret.emplace_back(libav().av_get_pix_fmt_name(*pix_fmts));
     ++pix_fmts;
   }
   return c10::Join(", ", ret);
@@ -423,7 +430,7 @@ AVPixelFormat get_enc_fmt(
     const AVCodec* codec) {
   if (encoder_format) {
     const auto& val = encoder_format.value();
-    auto fmt = av_get_pix_fmt(val.c_str());
+    auto fmt = libav().av_get_pix_fmt(val.c_str());
     TORCH_CHECK(
         supported_pix_fmt(fmt, codec->pix_fmts),
         codec->name,
@@ -461,7 +468,7 @@ AVRational get_enc_rate(
         std::isfinite(enc_rate) && enc_rate > 0,
         "Encoder sample rate must be positive and fininte. Found: ",
         enc_rate);
-    AVRational rate = av_d2q(enc_rate, 1 << 24);
+    AVRational rate = libav().av_d2q(enc_rate, 1 << 24);
     TORCH_CHECK(
         supported_frame_rate(rate, codec->supported_framerates),
         codec->name,
@@ -545,14 +552,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
   // context to AVCodecContext. But this way, it will be deallocated
   // automatically at the time AVCodecContext is freed, so we do that.
 
-  ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index()));
+  ctx->hw_device_ctx = libav().av_buffer_ref(get_cuda_context(device.index()));
   TORCH_INTERNAL_ASSERT(
       ctx->hw_device_ctx, "Failed to reference HW device context.");
 
   ctx->sw_pix_fmt = ctx->pix_fmt;
   ctx->pix_fmt = AV_PIX_FMT_CUDA;
 
-  ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx);
+  ctx->hw_frames_ctx = libav().av_hwframe_ctx_alloc(ctx->hw_device_ctx);
   TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context.");
 
   auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data);
@@ -562,7 +569,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
   frames_ctx->height = ctx->height;
   frames_ctx->initial_pool_size = 5;
 
-  int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx);
+  int ret = libav().av_hwframe_ctx_init(ctx->hw_frames_ctx);
   TORCH_CHECK(
       ret >= 0,
       "Failed to initialize CUDA frame context: ",
@@ -574,11 +581,11 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
 ////////////////////////////////////////////////////////////////////////////////
 
 AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) {
-  AVStream* stream = avformat_new_stream(format_ctx, nullptr);
+  AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr);
   TORCH_CHECK(stream, "Failed to allocate stream.");
 
   stream->time_base = codec_ctx->time_base;
-  int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx);
+  int ret = libav().avcodec_parameters_from_context(stream->codecpar, codec_ctx);
   TORCH_CHECK(
       ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret));
   return stream;
@@ -605,7 +612,7 @@ FilterGraph get_audio_filter_graph(
     if (filter_desc || src_fmt != enc_fmt ||
         src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) {
       std::stringstream ss;
-      ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt)
+      ss << "aformat=sample_fmts=" << libav().av_get_sample_fmt_name(enc_fmt)
          << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x"
          << std::hex << enc_ch_layout;
       parts.push_back(ss.str());
@@ -656,7 +663,7 @@ FilterGraph get_video_filter_graph(
     }
     if (filter_desc || src_fmt != enc_fmt) {
       std::stringstream ss;
-      ss << "format=" << av_get_pix_fmt_name(enc_fmt);
+      ss << "format=" << libav().av_get_pix_fmt_name(enc_fmt);
       parts.emplace_back(ss.str());
     }
     if (filter_desc ||
@@ -695,7 +702,7 @@ AVFramePtr get_audio_frame(
   frame->channel_layout = channel_layout;
   frame->sample_rate = sample_rate;
   frame->nb_samples = nb_samples;
-  int ret = av_frame_get_buffer(frame, 0);
+  int ret = libav().av_frame_get_buffer(frame, 0);
   TORCH_CHECK(
       ret >= 0, "Error allocating the source audio frame:", av_err2string(ret));
 
@@ -711,7 +718,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) {
   frame->format = src_fmt;
   frame->width = width;
   frame->height = height;
-  int ret = av_frame_get_buffer(frame, 0);
+  int ret = libav().av_frame_get_buffer(frame, 0);
   TORCH_CHECK(
       ret >= 0, "Error allocating a video buffer :", av_err2string(ret));
 
@@ -756,10 +763,10 @@ EncodeProcess get_audio_encode_process(
   // case, restrictions on the format to support tensor inputs do not apply, and
   // so we directly get the format via FFmpeg.
   const AVSampleFormat src_fmt = (disable_converter)
-      ? av_get_sample_fmt(format.c_str())
+      ? libav().av_get_sample_fmt(format.c_str())
       : get_src_sample_fmt(format);
   const auto src_ch_layout =
-      static_cast<uint64_t>(av_get_default_channel_layout(src_num_channels));
+      static_cast<uint64_t>(libav().av_get_default_channel_layout(src_num_channels));
 
   // 2. Fetch codec from default or override
   TORCH_CHECK(
@@ -779,7 +786,7 @@ EncodeProcess get_audio_encode_process(
       // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277
       // This is the case for at least until FFmpeg 6.0, so it will be
       // like this for a while.
-      return static_cast<uint64_t>(av_get_default_channel_layout(2));
+      return static_cast<uint64_t>(libav().av_get_default_channel_layout(2));
     }
     return get_channel_layout(src_ch_layout, encoder_num_channels, codec);
   }();
@@ -867,9 +874,9 @@ EncodeProcess get_video_encode_process(
   // case, restrictions on the format to support tensor inputs do not apply, and
   // so we directly get the format via FFmpeg.
   const AVPixelFormat src_fmt = (disable_converter)
-      ? av_get_pix_fmt(format.c_str())
+      ? libav().av_get_pix_fmt(format.c_str())
       : get_src_pix_fmt(format);
-  const AVRational src_rate = av_d2q(frame_rate, 1 << 24);
+  const AVRational src_rate = libav().av_d2q(frame_rate, 1 << 24);
 
   // 2. Fetch codec from default or override
   TORCH_CHECK(
@@ -936,7 +943,7 @@ EncodeProcess get_video_encode_process(
   AVFramePtr src_frame = [&]() {
     if (codec_ctx->hw_frames_ctx) {
       AVFramePtr frame{alloc_avframe()};
-      int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
+      int ret = libav().av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
       TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
       frame->nb_samples = 1;
       frame->pts = 0;
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
index 3d2e5015357..038ae47ce36 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
@@ -1,7 +1,10 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/encoder.h>
 
 namespace torchaudio::io {
 
+using detail::libav;
+
 Encoder::Encoder(
     AVFormatContext* format_ctx,
     AVCodecContext* codec_ctx,
@@ -13,10 +16,10 @@ Encoder::Encoder(
 ///
 /// @param frame Frame data to encode
 void Encoder::encode(AVFrame* frame) {
-  int ret = avcodec_send_frame(codec_ctx, frame);
+  int ret = libav().avcodec_send_frame(codec_ctx, frame);
   TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ").");
   while (ret >= 0) {
-    ret = avcodec_receive_packet(codec_ctx, packet);
+    ret = libav().avcodec_receive_packet(codec_ctx, packet);
     if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
       if (ret == AVERROR_EOF) {
         // Note:
@@ -31,7 +34,7 @@ void Encoder::encode(AVFrame* frame) {
         // An alternative is to use `av_write_frame` functoin, but in that case
         // client code is responsible for ordering packets, which makes it
         // complicated to use StreamWriter
-        ret = av_interleaved_write_frame(format_ctx, nullptr);
+        ret = libav().av_interleaved_write_frame(format_ctx, nullptr);
         TORCH_CHECK(
             ret >= 0, "Failed to flush packet (", av_err2string(ret), ").");
       }
@@ -51,10 +54,10 @@ void Encoder::encode(AVFrame* frame) {
       // This has to be set before av_packet_rescale_ts bellow.
       packet->duration = 1;
     }
-    av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base);
+    libav().av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base);
     packet->stream_index = stream->index;
 
-    ret = av_interleaved_write_frame(format_ctx, packet);
+    ret = libav().av_interleaved_write_frame(format_ctx, packet);
     TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ").");
   }
 }
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
index 0701c5a5965..773081987a8 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
@@ -1,13 +1,17 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/packet_writer.h>
 
 namespace torchaudio::io {
+
+using detail::libav;
+
 namespace {
 AVStream* add_stream(
     AVFormatContext* format_ctx,
     const StreamParams& stream_params) {
-  AVStream* stream = avformat_new_stream(format_ctx, nullptr);
+  AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr);
   int ret =
-      avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
+      libav().avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
   TORCH_CHECK(
       ret >= 0,
       "Failed to copy the stream's codec parameters. (",
@@ -26,11 +30,11 @@ PacketWriter::PacketWriter(
 
 void PacketWriter::write_packet(const AVPacketPtr& packet) {
   AVPacket dst_packet;
-  int ret = av_packet_ref(&dst_packet, packet);
+  int ret = libav().av_packet_ref(&dst_packet, packet);
   TORCH_CHECK(ret >= 0, "Failed to copy packet.");
-  av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base);
+  libav().av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base);
   dst_packet.stream_index = stream->index;
-  ret = av_interleaved_write_frame(format_ctx, &dst_packet);
+  ret = libav().av_interleaved_write_frame(format_ctx, &dst_packet);
   TORCH_CHECK(ret >= 0, "Failed to write packet to destination.");
 }
 } // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
index df51d92355c..7aacd013994 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -1,11 +1,14 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
 
 #ifdef USE_CUDA
 #include <c10/cuda/CUDAStream.h>
 #endif
 
-namespace torchaudio {
-namespace io {
+namespace torchaudio::io {
+
+using detail::libav;
+
 namespace {
 
 AVFormatContext* get_output_format_context(
@@ -19,7 +22,7 @@ AVFormatContext* get_output_format_context(
   }
 
   AVFormatContext* p = nullptr;
-  int ret = avformat_alloc_output_context2(
+  int ret = libav().avformat_alloc_output_context2(
       &p, nullptr, format ? format.value().c_str() : nullptr, dst.c_str());
   TORCH_CHECK(
       ret >= 0,
@@ -208,14 +211,14 @@ void StreamWriter::add_video_frame_stream(
 }
 
 void StreamWriter::set_metadata(const OptionDict& metadata) {
-  av_dict_free(&format_ctx->metadata);
+  libav().av_dict_free(&format_ctx->metadata);
   for (auto const& [key, value] : metadata) {
-    av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0);
+    libav().av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0);
   }
 }
 
 void StreamWriter::dump_format(int64_t i) {
-  av_dump_format(format_ctx, (int)i, format_ctx->url, 1);
+  libav().av_dump_format(format_ctx, (int)i, format_ctx->url, 1);
 }
 
 void StreamWriter::open(const c10::optional<OptionDict>& option) {
@@ -231,10 +234,10 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
   AVDictionary* opt = get_option_dict(option);
   if (!(fmt->flags & AVFMT_NOFILE) &&
       !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
-    ret = avio_open2(
+    ret = libav().avio_open2(
         &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt);
     if (ret < 0) {
-      av_dict_free(&opt);
+      libav().av_dict_free(&opt);
       TORCH_CHECK(
           false,
           "Failed to open dst: ",
@@ -245,7 +248,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
     }
   }
 
-  ret = avformat_write_header(format_ctx, &opt);
+  ret = libav().avformat_write_header(format_ctx, &opt);
   clean_up_dict(opt);
   TORCH_CHECK(
       ret >= 0,
@@ -258,7 +261,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
 }
 
 void StreamWriter::close() {
-  int ret = av_write_trailer(format_ctx);
+  int ret = libav().av_write_trailer(format_ctx);
   if (ret < 0) {
     LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ").";
   }
@@ -269,7 +272,7 @@ void StreamWriter::close() {
   if (!(fmt->flags & AVFMT_NOFILE) &&
       !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
     // avio_closep can be only applied to AVIOContext opened by avio_open
-    avio_closep(&(format_ctx->pb));
+    libav().avio_closep(&(format_ctx->pb));
   }
   is_open = false;
 }
@@ -355,12 +358,12 @@ AVIOContext* get_io_context(
     int buffer_size,
     int (*write_packet)(void* opaque, uint8_t* buf, int buf_size),
     int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
-  unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
+  unsigned char* buffer = static_cast<unsigned char*>(libav().av_malloc(buffer_size));
   TORCH_CHECK(buffer, "Failed to allocate buffer.");
-  AVIOContext* io_ctx = avio_alloc_context(
+  AVIOContext* io_ctx = libav().avio_alloc_context(
       buffer, buffer_size, 1, opaque, nullptr, write_packet, seek);
   if (!io_ctx) {
-    av_freep(&buffer);
+    libav().av_freep(&buffer);
     TORCH_CHECK(false, "Failed to allocate AVIOContext.");
   }
   return io_ctx;
@@ -384,5 +387,4 @@ StreamWriterCustomIO::StreamWriterCustomIO(
     : CustomOutput(opaque, buffer_size, write_packet, seek),
       StreamWriter(io_ctx, format) {}
 
-} // namespace io
-} // namespace torchaudio
+} // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
index e9350f0479a..00ae55a6b77 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
@@ -1,3 +1,4 @@
+#include <torchaudio/csrc/ffmpeg/libav.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.h>
 
 #ifdef USE_CUDA
@@ -6,6 +7,8 @@
 
 namespace torchaudio::io {
 
+using detail::libav;
+
 namespace {
 
 using InitFunc = TensorConverter::InitFunc;
@@ -41,8 +44,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334
-  if (!av_frame_is_writable(buffer)) {
-    int ret = av_frame_make_writable(buffer);
+  if (!libav().av_frame_is_writable(buffer)) {
+    int ret = libav().av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -145,8 +148,8 @@ void write_interlaced_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  if (!av_frame_is_writable(buffer)) {
-    int ret = av_frame_make_writable(buffer);
+  if (!libav().av_frame_is_writable(buffer)) {
+    int ret = libav().av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -187,7 +190,7 @@ void write_planar_video(
     AVFrame* buffer,
     int num_planes) {
   const auto num_colors =
-      av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
+      libav().av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors);
@@ -195,8 +198,8 @@ void write_planar_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  if (!av_frame_is_writable(buffer)) {
-    int ret = av_frame_make_writable(buffer);
+  if (!libav().av_frame_is_writable(buffer)) {
+    int ret = libav().av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -308,7 +311,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
         TORCH_CHECK(
             false,
             "Unexpected pixel format for CUDA: ",
-            av_get_pix_fmt_name(sw_pix_fmt));
+            libav().av_get_pix_fmt_name(sw_pix_fmt));
     }
   }
 
@@ -317,7 +320,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_BGR24: {
-      int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components;
+      int channels = libav().av_pix_fmt_desc_get(pix_fmt)->nb_components;
       InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) {
         validate_video_input(t, f, channels);
         return init_interlaced(t);
@@ -339,7 +342,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
     }
     default:
       TORCH_CHECK(
-          false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt));
+          false, "Unexpected pixel format: ", libav().av_get_pix_fmt_name(pix_fmt));
   }
 }
 
@@ -383,7 +386,7 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size)
       break;
     default:
       TORCH_INTERNAL_ASSERT(
-          false, "Unsupported media type: ", av_get_media_type_string(type));
+          false, "Unsupported media type: ", libav().av_get_media_type_string(type));
   }
 }