From a1ad68bb3090bf6dd1a236ef3daaf432f7524dbc Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 13 Aug 2020 13:35:26 -0500 Subject: [PATCH 001/128] adding base files --- torchvision/csrc/cpu/video/Video.cpp | 0 torchvision/csrc/cpu/video/Video.h | 42 ++++++++++++++++++++++++++++ video_reader.todo | 15 ++++++++++ 3 files changed, 57 insertions(+) create mode 100644 torchvision/csrc/cpu/video/Video.cpp create mode 100644 torchvision/csrc/cpu/video/Video.h create mode 100644 video_reader.todo diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h new file mode 100644 index 00000000000..13d05a90128 --- /dev/null +++ b/torchvision/csrc/cpu/video/Video.h @@ -0,0 +1,42 @@ +#pragma once + +#ifndef VIDEO_H_ +#define VIDEO_H_ + + +#include +#include + +#include +#include +#include +#include + +#include "../decoder/Stream.h" + + + +struct VideoMetadata{ + double videoFps; // average frame rate for the video (float) + double videoDuration; // real world video duration in seconds (float) + double videoStartTime; // video start time in seconds (float) + // do we need a constructor here? +} + +class Video { + std::vector Metadata; + std::vector AvailStreams; // TODO: add stream type + public: + Video(std::string filename, std::string stream="video"); + void Seek(double ts, std::string stream="", bool any_frame=False); + torch::List Next(std::string stream="") + torch::List Peak(std::string stream="") + protected: + // AV container type (check in decoder for exact type) + private: + int64_t SecToStream(double ts); // TODO: add stream type + float StreamToSec(int64_t pts); // TODO: add stream type + void SetVideoStream(std::string stream="video:0") // this needs to be improved +} // class Video + +#endif // VIDEO_H_ diff --git a/video_reader.todo b/video_reader.todo new file mode 100644 index 00000000000..2a01bbde4b8 --- /dev/null +++ b/video_reader.todo @@ -0,0 +1,15 @@ +The new API: + ☐ the c++ extension is going to live in torchvision/csrc/cpu/video + ☐ modification of the build needs to go to setup.py + ☐ torchvision/io/_video something needs to happen somehow + +Tests changes: + ☐ test/test_io.py + ☐ test/test_video_reader.py (change to test video api) + + + +Implementation: + ☐ Datatype for strem + ☐ Datatype for container + ☐ Do I use tensor as a type in metadata \ No newline at end of file From 0abcfed6fbbe132dcd1f7c78e08a4b36072f5b19 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 14 Aug 2020 04:05:59 -0500 Subject: [PATCH 002/128] setup modification to actually build the thing --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 13c7a98ec74..6275b7b9f5a 100644 --- a/setup.py +++ b/setup.py @@ -331,10 +331,13 @@ def get_extensions(): base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder') base_decoder_src = glob.glob( os.path.join(base_decoder_src_dir, "*.cpp")) + # Torchvision video API + videoapi_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video') + videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp")) # exclude tests base_decoder_src = [x for x in base_decoder_src if '_test.cpp' not in x] - combined_src = video_reader_src + base_decoder_src + combined_src = video_reader_src + base_decoder_src + videoapi_src ext_modules.append( CppExtension( @@ -343,6 +346,7 @@ def get_extensions(): include_dirs=[ base_decoder_src_dir, video_reader_src_dir, + videoapi_src_dir, ffmpeg_include_dir, extensions_dir, ], From ecbec59ad21afc576ece351387b42ab7e00f52ad Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Mon, 17 Aug 2020 10:07:05 -0500 Subject: [PATCH 003/128] video api constructor registration --- torchvision/csrc/cpu/video/Video.cpp | 119 ++++++++++++++++++++++++ torchvision/csrc/cpu/video/Video.h | 32 ++++--- torchvision/csrc/cpu/video/register.cpp | 16 ++++ 3 files changed, 154 insertions(+), 13 deletions(-) create mode 100644 torchvision/csrc/cpu/video/register.cpp diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp index e69de29bb2d..cb3cfa3e275 100644 --- a/torchvision/csrc/cpu/video/Video.cpp +++ b/torchvision/csrc/cpu/video/Video.cpp @@ -0,0 +1,119 @@ + +# include "Video.h" +#include +#include +#include "sync_decoder.h" +#include "sync_decoder.h" +#include "memory_buffer.h" +#include "defs.h" + + +using namespace std; +using namespace ffmpeg; + + +// If we are in a Windows environment, we need to define +// initialization functions for the _custom_ops extension +#ifdef _WIN32 +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC init_video_reader(void) { + // No need to do anything. + return NULL; +} +#else +PyMODINIT_FUNC PyInit_video_reader(void) { + // No need to do anything. + return NULL; +} +#endif +#endif + + +// namespace Video{ +const size_t decoderTimeoutMs = 600000; +const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; +const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; +// A jitter can be added to the end of the range to avoid conversion/rounding +// error, small value 100us won't be enough to select the next frame, but enough +// to compensate rounding error due to the multiple conversions. +const size_t timeBaseJitterUs = 100; + +void Video::_getDecoderParams( + int64_t videoStartUs, + int64_t getPtsOnly, + // how enum works, but stream type + int stream_id=-1, + double seekFrameMarginUs=10){ + + params.headerOnly = getPtsOnly != 0; + params.seekAccuracy = seekFrameMarginUs; + params.startOffset = videoStartUs; + params.timeoutMs = decoderTimeoutMs; + params.preventStaleness = false; // not sure what this is about + + // define the stream using the correct parsing technique +} // _get decoder params + + +Video::Video( + std::string videoPath, + std::string stream, + bool isReadFile, + int64_t audioSamples=0, + int64_t audioChannels=1) { + + + //parse stream information + + // set current stream + DecoderParameters params; + Video::_getDecoderParams( + 0, // video start + false, //headerOnly + // stream_type parsed from info above + // stream_id parsed from info above + audioSamples, + audioChannels + ); + + std::string logMessage, logType; + DecoderInCallback callback = nullptr; + // TODO: add read from memory option + params.uri = videoPath; + logType = "file"; + logMessage = videoPath; + + + // get a decoder + SyncDecoder decoder; + bool succeeded; + + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] has started"; + + DecoderMetadata audioMetadata, videoMetadata, dataMetadata; + std::vector metadata; + if ((succeeded = decoder.init(params, std::move(callback), &metadata))) { + for (const auto& header : metadata) { + VLOG(1) << "Decoding stream of" << header.format.type ; + if (header.format.type == TYPE_VIDEO) { + videoMetadata = header; + } else if (header.format.type == TYPE_AUDIO) { + audioMetadata = header; + } else { + dataMetadata = header; + }; + } + } +} //video + +// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){ +// } + +// torch::List Video::Next(){ +// return +// } + + + +// }; // namespace video \ No newline at end of file diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h index 13d05a90128..00e3232de26 100644 --- a/torchvision/csrc/cpu/video/Video.h +++ b/torchvision/csrc/cpu/video/Video.h @@ -12,7 +12,12 @@ #include #include -#include "../decoder/Stream.h" +#include +#include "sync_decoder.h" +#include "memory_buffer.h" +#include "defs.h" + +using namespace ffmpeg; @@ -21,22 +26,23 @@ struct VideoMetadata{ double videoDuration; // real world video duration in seconds (float) double videoStartTime; // video start time in seconds (float) // do we need a constructor here? -} +}; -class Video { +struct Video : torch::CustomClassHolder { std::vector Metadata; - std::vector AvailStreams; // TODO: add stream type + // std::vector AvailStreams; // TODO: add stream type public: - Video(std::string filename, std::string stream="video"); - void Seek(double ts, std::string stream="", bool any_frame=False); - torch::List Next(std::string stream="") - torch::List Peak(std::string stream="") - protected: + Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels); + // void Seek(double ts, std::string stream="", bool any_frame=False); + // torch::List Next(std::string stream="") + // torch::List Peak(std::string stream="") + // protected: // AV container type (check in decoder for exact type) private: - int64_t SecToStream(double ts); // TODO: add stream type - float StreamToSec(int64_t pts); // TODO: add stream type - void SetVideoStream(std::string stream="video:0") // this needs to be improved -} // class Video + DecoderParameters params; + // int64_t SecToStream(double ts); // TODO: add stream type + // float StreamToSec(int64_t pts); // TODO: add stream type + void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved +}; // class Video #endif // VIDEO_H_ diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp new file mode 100644 index 00000000000..e4dde7a5530 --- /dev/null +++ b/torchvision/csrc/cpu/video/register.cpp @@ -0,0 +1,16 @@ +#ifndef REGISTER_H +#define REGISTER_H + +#include "Video.h" + +namespace { + +//////////////////////////////////////////////////////////////////////////////// +// typedefs.h +//////////////////////////////////////////////////////////////////////////////// +static auto registerVideo = + torch::class_