NOTE(review): this patch was rebuilt from a whitespace-collapsed copy. Line
breaks and blank context lines follow the hunk line counts; context-line
indentation is a best-effort reconstruction, so apply it with
`git apply --ignore-whitespace` and confirm against the target tree.
Tokens that had been stripped from the copy (`<BaseTsd.h>`, C++ template
arguments) were restored. The `index` post-image hashes of
packaging/pkg_helpers.bash and torchvision/io/_video_opt.py are stale because
their added lines were revised during review.

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 96b66319ed6..91275ff31bd 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -1,4 +1,5 @@
 channels:
+  - pytorch
   - defaults
 dependencies:
   - numpy
@@ -8,6 +9,7 @@ dependencies:
   - pip
   - libpng
   - jpeg
+  - ffmpeg=4.2
   - ca-certificates
   - pip:
     - future
diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml
index b250da62494..9f4348ebb26 100644
--- a/.circleci/unittest/windows/scripts/environment.yml
+++ b/.circleci/unittest/windows/scripts/environment.yml
@@ -1,4 +1,5 @@
 channels:
+  - pytorch
   - defaults
 dependencies:
   - numpy
diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh
index 043d2ed7ea9..60adf38f901 100755
--- a/packaging/build_wheel.sh
+++ b/packaging/build_wheel.sh
@@ -32,6 +32,8 @@ else
     cp "/usr/lib64/libjpeg.so" torchvision
 fi
 
+download_copy_ffmpeg
+
 if [[ "$OSTYPE" == "msys" ]]; then
     IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel
 else
diff --git a/packaging/conda/build_vision.sh b/packaging/conda/build_vision.sh
index 200b99fb22c..79a40cb2cb5 100755
--- a/packaging/conda/build_vision.sh
+++ b/packaging/conda/build_vision.sh
@@ -127,7 +127,7 @@ else
 fi
 
 if [[ -z "$PYTORCH_VERSION" ]]; then
-    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch"
     export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
                               python -c "import os, sys, json, re; cuver = '$cuver'; \
                               cuver = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
index 9ee70af1c52..da77fe57047 100644
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@@ -240,7 +240,7 @@ setup_pip_pytorch_version() {
 # You MUST have populated PYTORCH_VERSION_SUFFIX before hand.
 setup_conda_pytorch_constraint() {
   if [[ -z "$PYTORCH_VERSION" ]]; then
-    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch"
     export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
                               python -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \
                               cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
@@ -350,3 +350,51 @@ setup_junit_results_folder() {
     export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml"
   fi
 }
+
+
+# Download the FFmpeg 4.2 package from the pytorch conda channel and copy its
+# shared libraries into ./torchvision (build_wheel.sh calls this right before
+# building the wheel). Expects the current directory to contain torchvision/.
+#   * Windows (msys): FFmpeg is currently disabled; only a notice is printed.
+#   * macOS: installs ffmpeg=4.2 and wget through conda, then copies only the
+#     dylibs with a lowercase letter right before ".dylib" (presumably to skip
+#     version-numbered duplicates -- TODO confirm).
+#   * Linux: copies lib/*.so, and also installs lib/, bin/ and include/ under
+#     /usr before running ldconfig.
+# Work happens in a scratch directory (ffmpeg_tmp) that is removed at the end.
+download_copy_ffmpeg() {
+  mkdir -p ffmpeg_tmp
+  # Stop if the scratch directory cannot be entered; otherwise the trailing
+  # `cd ..` and `rm -rf` would run against the wrong location.
+  cd ffmpeg_tmp || return 1
+  if [[ "$OSTYPE" == "msys" ]]; then
+    # conda install -yq ffmpeg -c pytorch
+    # curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2
+    # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=-
+    # cp Library/bin/*.dll ../torchvision
+    echo "FFmpeg is disabled currently on Windows"
+  else
+    if [[ "$(uname)" == Darwin ]]; then
+      conda install -yq ffmpeg=4.2 -c pytorch
+      conda install -yq wget
+      wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/osx-64/ffmpeg-4.2-h0a44026_0.tar.bz2
+      tar -xjvf ffmpeg-4.2-h0a44026_0.tar.bz2
+      for f in lib/*.dylib; do
+        if [[ $f =~ ([a-z])+\.dylib ]]; then
+          cp "$f" ../torchvision
+        fi
+      done
+    else
+      wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2
+      tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2
+      cp lib/*.so ../torchvision
+      cp -r lib/* /usr/lib
+      cp -r bin/* /usr/bin
+      cp -r include/* /usr/include
+      ldconfig
+      which ffmpeg
+    fi
+  fi
+  cd .. || return 1
+  rm -rf ffmpeg_tmp
+}
diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml
index 5188bb0ebec..257515c8b70 100644
--- a/packaging/torchvision/conda_build_config.yaml
+++ b/packaging/torchvision/conda_build_config.yaml
@@ -1,3 +1,5 @@
+channel_sources:
+  - pytorch-nightly,pytorch,defaults
 blas_impl:
   - mkl                        # [x86_64]
 c_compiler:
diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml
index 55d1e0b4d70..fadd9b47f72 100644
--- a/packaging/torchvision/meta.yaml
+++ b/packaging/torchvision/meta.yaml
@@ -10,6 +10,7 @@ requirements:
     - {{ compiler('c') }} # [win]
     - libpng
     - jpeg
+    - ffmpeg =4.2 # [not win]
 
   host:
     - python
@@ -21,6 +22,7 @@ requirements:
   run:
     - python
     - libpng
+    - ffmpeg =4.2 # [not win]
     - jpeg
     - pillow >=4.1.1
     - numpy >=1.11
@@ -48,7 +50,7 @@ test:
   requires:
     - pytest
     - scipy
-    - av
+    - av =8.0.1
     - ca-certificates
     {{ environ.get('CONDA_TYPING_CONSTRAINT') }}
 
diff --git a/setup.py b/setup.py
index 2f87b070170..4e927923fcb 100644
--- a/setup.py
+++ b/setup.py
@@ -337,7 +337,9 @@ def get_extensions():
         ffmpeg_bin = os.path.dirname(ffmpeg_exe)
         ffmpeg_root = os.path.dirname(ffmpeg_bin)
         ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
+        ffmpeg_library_dir = os.path.join(ffmpeg_root, 'lib')
         print("ffmpeg include path: {}".format(ffmpeg_include_dir))
+        print("ffmpeg library_dir: {}".format(ffmpeg_library_dir))
 
         # TorchVision base decoder + video reader
         video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
@@ -360,7 +362,7 @@ def get_extensions():
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],
-                library_dirs=library_dirs,
+                library_dirs=[ffmpeg_library_dir] + library_dirs,
                 libraries=[
                     'avcodec',
                     'avformat',
@@ -368,8 +370,8 @@ def get_extensions():
                     'swresample',
                     'swscale',
                 ],
-                extra_compile_args=["-std=c++14"],
-                extra_link_args=["-std=c++14"],
+                extra_compile_args=["-std=c++14"] if os.name != 'nt' else ['/std:c++14', '/MP'],
+                extra_link_args=["-std=c++14" if os.name != 'nt' else '/std:c++14'],
             )
         )
 
diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py
index f94af400838..8075c701ed9 100644
--- a/test/test_datasets_video_utils_opt.py
+++ b/test/test_datasets_video_utils_opt.py
@@ -2,8 +2,8 @@
 from torchvision import set_video_backend
 import test_datasets_video_utils
 
-
-set_video_backend('video_reader')
+# Disabling the video backend switching temporarily
+# set_video_backend('video_reader')
 
 
 if __name__ == '__main__':
diff --git a/test/test_io_opt.py b/test/test_io_opt.py
index 1ad3dea8fa2..87698b34624 100644
--- a/test/test_io_opt.py
+++ b/test/test_io_opt.py
@@ -3,7 +3,8 @@
 import test_io
 
 
-set_video_backend('video_reader')
+# Disabling the video backend switching temporarily
+# set_video_backend('video_reader')
 
 
 if __name__ == '__main__':
diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h
index 69b69721226..3fcd3ae316f 100644
--- a/torchvision/csrc/cpu/decoder/decoder.h
+++ b/torchvision/csrc/cpu/decoder/decoder.h
@@ -5,6 +5,11 @@
 #include "seekable_buffer.h"
 #include "stream.h"
 
+#if defined(_MSC_VER)
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
 namespace ffmpeg {
 
 /**
diff --git a/torchvision/csrc/cpu/decoder/stream.cpp b/torchvision/csrc/cpu/decoder/stream.cpp
index ec508639e7a..4da48647382 100644
--- a/torchvision/csrc/cpu/decoder/stream.cpp
+++ b/torchvision/csrc/cpu/decoder/stream.cpp
@@ -3,6 +3,7 @@
 #include "util.h"
 
 namespace ffmpeg {
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 
 Stream::Stream(
     AVFormatContext* inputCtx,
@@ -85,7 +86,7 @@ int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
     header.num = steam->time_base.num;
     header.den = steam->time_base.den;
     header.duration =
-        av_rescale_q(steam->duration, steam->time_base, AV_TIME_BASE_Q);
+        av_rescale_q(steam->duration, steam->time_base, timeBaseQ);
     metadata->push_back(header);
   }
 
@@ -238,7 +239,7 @@ void Stream::setFramePts(DecoderHeader* header, bool flush) {
     header->pts = av_rescale_q(
         header->pts,
         inputCtx_->streams[format_.stream]->time_base,
-        AV_TIME_BASE_Q);
+        timeBaseQ);
   }
 
   switch (format_.type) {
diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp
index 87906e78fe4..0d3fc9f12c1 100644
--- a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp
+++ b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp
@@ -4,6 +4,7 @@
 #include "util.h"
 
 namespace ffmpeg {
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 
 SubtitleStream::SubtitleStream(
     AVFormatContext* inputCtx,
@@ -65,7 +66,7 @@ int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
   // set proper pts in us
   if (gotFramePtr) {
     sub_.pts = av_rescale_q(
-        pkt.pts, inputCtx_->streams[format_.stream]->time_base, AV_TIME_BASE_Q);
+        pkt.pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
   }
 
   return result;
diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.cpp b/torchvision/csrc/cpu/video_reader/VideoReader.cpp
index 3a184716b4d..e50dc554956 100644
--- a/torchvision/csrc/cpu/video_reader/VideoReader.cpp
+++ b/torchvision/csrc/cpu/video_reader/VideoReader.cpp
@@ -29,6 +29,7 @@ namespace video_reader {
 
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 const size_t decoderTimeoutMs = 600000;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -99,8 +100,8 @@ size_t fillTensor(
   for (size_t i = 0; i < msgs.size(); ++i) {
     const auto& msg = msgs[i];
     // convert pts into original time_base
-    AVRational avr = {(int)num, (int)den};
-    framePtsData[i] = av_rescale_q(msg.header.pts, AV_TIME_BASE_Q, avr);
+    AVRational avr = AVRational{(int)num, (int)den};
+    framePtsData[i] = av_rescale_q(msg.header.pts, timeBaseQ, avr);
     VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts
             << ", original: " << framePtsData[i];
 
@@ -156,28 +157,26 @@ void offsetsToUs(
   videoEndUs = -1;
 
   if (readVideoStream) {
-    AVRational vr = {(int)videoTimeBaseNum, (int)videoTimeBaseDen};
+    AVRational vr = AVRational{(int)videoTimeBaseNum, (int)videoTimeBaseDen};
     if (videoStartPts > 0) {
-      videoStartUs = av_rescale_q(videoStartPts, vr, AV_TIME_BASE_Q);
+      videoStartUs = av_rescale_q(videoStartPts, vr, timeBaseQ);
     }
     if (videoEndPts > 0) {
       // Add jitter to the end of the range to avoid conversion/rounding error.
       // Small value 100us won't be enough to select the next frame, but enough
       // to compensate rounding error due to the multiple conversions.
-      videoEndUs =
-          timeBaseJitterUs + av_rescale_q(videoEndPts, vr, AV_TIME_BASE_Q);
+      videoEndUs = timeBaseJitterUs + av_rescale_q(videoEndPts, vr, timeBaseQ);
     }
   } else if (readAudioStream) {
-    AVRational ar = {(int)audioTimeBaseNum, (int)audioTimeBaseDen};
+    AVRational ar = AVRational{(int)audioTimeBaseNum, (int)audioTimeBaseDen};
     if (audioStartPts > 0) {
-      videoStartUs = av_rescale_q(audioStartPts, ar, AV_TIME_BASE_Q);
+      videoStartUs = av_rescale_q(audioStartPts, ar, timeBaseQ);
     }
     if (audioEndPts > 0) {
       // Add jitter to the end of the range to avoid conversion/rounding error.
       // Small value 100us won't be enough to select the next frame, but enough
       // to compensate rounding error due to the multiple conversions.
-      videoEndUs =
-          timeBaseJitterUs + av_rescale_q(audioEndPts, ar, AV_TIME_BASE_Q);
+      videoEndUs = timeBaseJitterUs + av_rescale_q(audioEndPts, ar, timeBaseQ);
     }
   }
 }
@@ -336,8 +335,8 @@ torch::List<torch::Tensor> readVideo(
 
       videoDuration = torch::zeros({1}, torch::kLong);
       int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-      AVRational vr = {(int)header.num, (int)header.den};
-      videoDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, vr);
+      AVRational vr = AVRational{(int)header.num, (int)header.den};
+      videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, vr);
       VLOG(1) << "Video decoding from " << logType << " [" << logMessage
               << "] filled video tensors";
     } else {
@@ -398,8 +397,8 @@ torch::List<torch::Tensor> readVideo(
 
       audioDuration = torch::zeros({1}, torch::kLong);
       int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-      AVRational ar = {(int)header.num, (int)header.den};
-      audioDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, ar);
+      AVRational ar = AVRational{(int)header.num, (int)header.den};
+      audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, ar);
       VLOG(1) << "Video decoding from " << logType << " [" << logMessage
               << "] filled audio tensors";
     } else {
@@ -598,8 +597,8 @@ torch::List<torch::Tensor> probeVideo(
 
       videoDuration = torch::zeros({1}, torch::kLong);
       int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-      AVRational avr = {(int)header.num, (int)header.den};
-      videoDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, avr);
+      AVRational avr = AVRational{(int)header.num, (int)header.den};
+      videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
 
       VLOG(2) << "Prob fps: " << header.fps << ", duration: " << header.duration
               << ", num: " << header.num << ", den: " << header.den;
@@ -631,8 +630,8 @@ torch::List<torch::Tensor> probeVideo(
 
       audioDuration = torch::zeros({1}, torch::kLong);
       int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-      AVRational avr = {(int)header.num, (int)header.den};
-      audioDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, avr);
+      AVRational avr = AVRational{(int)header.num, (int)header.den};
+      audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
 
       VLOG(2) << "Prob sample rate: " << format.samples
               << ", duration: " << header.duration << ", num: " << header.num
diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py
index 17e1de59bec..ae4b0f7c869 100644
--- a/torchvision/io/_video_opt.py
+++ b/torchvision/io/_video_opt.py
@@ -88,7 +88,7 @@ def _validate_pts(pts_range):
         assert (
             pts_range[0] <= pts_range[1]
-        ), """Start pts should not be smaller than end pts, got
-            start pts: %d and end pts: %d""" % (
+        ), """Start pts should not be larger than end pts, got
+            start pts: {0:d} and end pts: {1:d}""".format(
             pts_range[0],
             pts_range[1],
         )