From a1ad68bb3090bf6dd1a236ef3daaf432f7524dbc Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 13 Aug 2020 13:35:26 -0500
Subject: [PATCH 001/128] adding base files

---
 torchvision/csrc/cpu/video/Video.cpp |  0
 torchvision/csrc/cpu/video/Video.h   | 42 ++++++++++++++++++++++++++++
 video_reader.todo                    | 15 ++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 torchvision/csrc/cpu/video/Video.cpp
 create mode 100644 torchvision/csrc/cpu/video/Video.h
 create mode 100644 video_reader.todo
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
new file mode 100644
index 00000000000..13d05a90128
--- /dev/null
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifndef VIDEO_H_
+#define VIDEO_H_
+
+
+#include <string>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <Python.h>
+#include <c10/util/Logging.h>
+#include <torch/script.h>
+
+#include "../decoder/Stream.h"
+
+
+
+struct VideoMetadata{
+    double videoFps;  // average frame rate for the video (float)
+    double videoDuration; // real world video duration in seconds (float)
+    double videoStartTime; // video start time in seconds (float)
+    // do we need a constructor here?
+}
+
+class Video {
+    std::vector<VideoMetadata> Metadata;
+    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    public:
+        Video(std::string filename, std::string stream="video");
+        void Seek(double ts, std::string stream="", bool any_frame=False);
+        torch::List<torch::Tensor> Next(std::string stream="")
+        torch::List<torch::Tensor> Peak(std::string stream="")
+    protected:
+        // AV container type (check in decoder for exact type)
+    private:
+        int64_t SecToStream(double ts); // TODO: add stream type
+        float StreamToSec(int64_t pts); // TODO: add stream type
+        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
+} // class Video
+
+#endif  // VIDEO_H_
diff --git a/video_reader.todo b/video_reader.todo
new file mode 100644
index 00000000000..2a01bbde4b8
--- /dev/null
+++ b/video_reader.todo
@@ -0,0 +1,15 @@
+The new API:
+    ☐ the c++ extension is going to live in torchvision/csrc/cpu/video
+    ☐ modification of the build needs to go to setup.py
+    ☐ torchvision/io/_video something needs to happen somehow
+
+Tests changes:
+    ☐ test/test_io.py
+    ☐ test/test_video_reader.py (change to test video api)
+
+
+
+Implementation:
+    ☐ Datatype for stream
+    ☐ Datatype for container
+    ☐ Do I use tensor as a type in metadata 
\ No newline at end of file

From 0abcfed6fbbe132dcd1f7c78e08a4b36072f5b19 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 14 Aug 2020 04:05:59 -0500
Subject: [PATCH 002/128] setup modification to actually build the thing

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 13c7a98ec74..6275b7b9f5a 100644
--- a/setup.py
+++ b/setup.py
@@ -331,10 +331,13 @@ def get_extensions():
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(
             os.path.join(base_decoder_src_dir, "*.cpp"))
+        # Torchvision video API
+        videoapi_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video')
+        videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp"))
         # exclude tests
         base_decoder_src = [x for x in base_decoder_src if '_test.cpp' not in x]
 
-        combined_src = video_reader_src + base_decoder_src
+        combined_src = video_reader_src + base_decoder_src + videoapi_src
 
         ext_modules.append(
             CppExtension(
@@ -343,6 +346,7 @@ def get_extensions():
                 include_dirs=[
                     base_decoder_src_dir,
                     video_reader_src_dir,
+                    videoapi_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

From ecbec59ad21afc576ece351387b42ab7e00f52ad Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 17 Aug 2020 10:07:05 -0500
Subject: [PATCH 003/128] video api constructor registration

---
 torchvision/csrc/cpu/video/Video.cpp    | 119 ++++++++++++++++++++++++
 torchvision/csrc/cpu/video/Video.h      |  32 ++++---
 torchvision/csrc/cpu/video/register.cpp |  16 ++++
 3 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 torchvision/csrc/cpu/video/register.cpp

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index e69de29bb2d..cb3cfa3e275 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -0,0 +1,119 @@
+
+# include "Video.h"
+#include <torch/script.h>
+#include <c10/util/Logging.h>
+#include "sync_decoder.h"
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+
+using namespace std;
+using namespace ffmpeg;
+
+
+// If we are in a Windows environment, we need to define
+// initialization functions for the video_reader extension
+#ifdef _WIN32
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC init_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#else
+PyMODINIT_FUNC PyInit_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#endif
+#endif
+
+
+// namespace Video{
+const size_t decoderTimeoutMs = 600000;
+const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
+const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
+// A jitter can be added to the end of the range to avoid conversion/rounding
+// error, small value 100us won't be enough to select the next frame, but enough
+// to compensate rounding error due to the multiple conversions.
+const size_t timeBaseJitterUs = 100;
+
+void Video::_getDecoderParams(
+        int64_t videoStartUs,
+        int64_t getPtsOnly,
+        // how enum works, but stream type
+        int stream_id=-1,
+        double seekFrameMarginUs=10){
+
+    params.headerOnly = getPtsOnly != 0;
+    params.seekAccuracy = seekFrameMarginUs;
+    params.startOffset = videoStartUs;
+    params.timeoutMs = decoderTimeoutMs;
+    params.preventStaleness = false;  // not sure what this is about
+
+    // define the stream using the correct parsing technique
+} // _get decoder params
+
+
+Video::Video(
+    std::string videoPath, 
+    std::string stream, 
+    bool isReadFile, 
+    int64_t audioSamples=0, 
+    int64_t audioChannels=1) {
+
+
+    //parse stream information
+
+    // set current stream
+    DecoderParameters params;
+    Video::_getDecoderParams(
+        0,   // video start
+        false,  //headerOnly
+        // stream_type parsed from info above
+        // stream_id parsed from info above
+        audioSamples,
+        audioChannels
+    );
+
+    std::string logMessage, logType;
+    DecoderInCallback callback = nullptr;
+    // TODO: add read from memory option
+    params.uri = videoPath;
+    logType = "file";
+    logMessage = videoPath;
+    
+
+    // get a decoder
+    SyncDecoder decoder;
+    bool succeeded;
+
+    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+          << "] has started";
+
+    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    std::vector<DecoderMetadata> metadata;
+    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+        for (const auto& header : metadata) {
+            VLOG(1) << "Decoding stream of" << header.format.type ;
+        if (header.format.type == TYPE_VIDEO) {
+            videoMetadata = header;
+        } else if (header.format.type == TYPE_AUDIO) {
+            audioMetadata = header;
+        } else {
+            dataMetadata = header;
+        };
+        }
+    } 
+} //video
+
+// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
+// }
+
+// torch::List<torch::Tensor> Video::Next(){
+//     return
+// }
+
+
+
+// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 13d05a90128..00e3232de26 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -12,7 +12,12 @@
 #include <c10/util/Logging.h>
 #include <torch/script.h>
 
-#include "../decoder/Stream.h"
+#include <exception>
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+using namespace ffmpeg;
 
 
 
@@ -21,22 +26,23 @@ struct VideoMetadata{
     double videoDuration; // real world video duration in seconds (float)
     double videoStartTime; // video start time in seconds (float)
     // do we need a constructor here?
-}
+};
 
-class Video {
+struct Video : torch::CustomClassHolder {
     std::vector<VideoMetadata> Metadata;
-    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // std::vector<Stream> AvailStreams;  // TODO: add stream type
     public:
-        Video(std::string filename, std::string stream="video");
-        void Seek(double ts, std::string stream="", bool any_frame=False);
-        torch::List<torch::Tensor> Next(std::string stream="")
-        torch::List<torch::Tensor> Peak(std::string stream="")
-    protected:
+        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        // void Seek(double ts, std::string stream="", bool any_frame=False);
+        // torch::List<torch::Tensor> Next(std::string stream="")
+        // torch::List<torch::Tensor> Peak(std::string stream="")
+    // protected:
         // AV container type (check in decoder for exact type)
     private:
-        int64_t SecToStream(double ts); // TODO: add stream type
-        float StreamToSec(int64_t pts); // TODO: add stream type
-        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
-} // class Video
+        DecoderParameters params;
+        // int64_t SecToStream(double ts); // TODO: add stream type
+        // float StreamToSec(int64_t pts); // TODO: add stream type
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+}; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
new file mode 100644
index 00000000000..e4dde7a5530
--- /dev/null
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -0,0 +1,16 @@
+#ifndef REGISTER_H
+#define REGISTER_H
+
+#include "Video.h"
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+
+} //namespace
+#endif

From 33d10bf2ec70357e65b3a3b3c5f86cd7605fb76a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:27:47 -0500
Subject: [PATCH 004/128] FAIL metadata

---
 torchvision/csrc/cpu/video/Video.cpp    | 91 ++++++++++++++++++-------
 torchvision/csrc/cpu/video/Video.h      | 29 +++++---
 torchvision/csrc/cpu/video/register.cpp | 13 ++--
 3 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index cb3cfa3e275..fb7a6891a6b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,7 +29,6 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
-// namespace Video{
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -43,6 +42,7 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
+        bool all_streams=false,
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -51,6 +51,24 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
+    if (all_streams == true){
+        MediaFormat audioFormat((long) -2);
+        audioFormat.type = TYPE_AUDIO;
+        audioFormat.format.audio.format = defaultAudioSampleFormat;
+        params.formats.insert(audioFormat);
+
+        MediaFormat videoFormat(0, (long) -2);
+        videoFormat.type = TYPE_VIDEO;
+        videoFormat.format.video.format = defaultVideoPixelFormat;
+        params.formats.insert(videoFormat);
+
+        // MediaFormat subtitleFormat("0", (long) -2);
+        // subtitleFormat.type = TYPE_SUBTITLE;
+        // MediaFormat ccFormat((double) 0, (long) -2);
+        // ccFormat.type = TYPE_CC;
+
+    }
+
     // define the stream using the correct parsing technique
 } // _get decoder params
 
@@ -58,22 +76,20 @@ void Video::_getDecoderParams(
 Video::Video(
     std::string videoPath, 
     std::string stream, 
-    bool isReadFile, 
-    int64_t audioSamples=0, 
-    int64_t audioChannels=1) {
+    bool isReadFile) {
 
 
     //parse stream information
 
     // set current stream
+    // note that in the initial version we want to get all streams
     DecoderParameters params;
     Video::_getDecoderParams(
-        0,   // video start
+        0,      // video start
         false,  //headerOnly
         // stream_type parsed from info above
-        // stream_id parsed from info above
-        audioSamples,
-        audioChannels
+        -2,     // stream_id parsed from info above
+        true    // read all streams
     );
 
     std::string logMessage, logType;
@@ -88,32 +104,55 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
-
-    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    
+    std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
             VLOG(1) << "Decoding stream of" << header.format.type ;
-        if (header.format.type == TYPE_VIDEO) {
-            videoMetadata = header;
-        } else if (header.format.type == TYPE_AUDIO) {
-            audioMetadata = header;
-        } else {
-            dataMetadata = header;
-        };
+        
+            // generate streamMetadata object
+            StreamMetadata streamInfo;
+            // parse stream timebase; divide in floating point -- if num/den are integral (declared in defs.h, not shown here) `num / den` truncates, e.g. 1/90000 -> 0
+            torch::Tensor timeBase = torch::zeros({1}, torch::kFloat);
+            float * timeBaseData = timeBase.data_ptr<float>();
+            timeBaseData[0] = static_cast<float>(static_cast<double>(header.num) / static_cast<double>(header.den));
+            streamInfo.timeBase = timeBase;
+            // parse stream duration
+            torch::Tensor duration = torch::zeros({1}, torch::kFloat);
+            float* durationData = duration.data_ptr<float>();
+            durationData[0] = (float) header.duration;
+            // to get duration in seconds multiply duration by timebase
+            streamInfo.duration = duration * streamInfo.timeBase;
+            
+            if (header.format.type == TYPE_VIDEO) {
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = header.fps;
+                streamInfo.frameRate = frameRate;
+                videoStreams.push_back(streamInfo);
+            } else if (header.format.type == TYPE_AUDIO) {
+                const auto& format = header.format.format.audio;
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = (float) format.samples;
+                streamInfo.frameRate = frameRate;
+                audioStreams.push_back(streamInfo);
+            };
         }
+        VideoMetadata.insert({"video", videoStreams});
+        VideoMetadata.insert({"autio", audioStreams});
     } 
 } //video
 
-// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
-// }
-
-// torch::List<torch::Tensor> Video::Next(){
-//     return
-// }
-
+// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+int Video::getMetadata() {
+    // return VideoMetadata;
+    return 5;
+}
 
 
-// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 00e3232de26..f9c104c1217 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,6 +11,8 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
+#include <torch/custom_class.h>
+
 
 #include <exception>
 #include "sync_decoder.h"
@@ -21,18 +23,29 @@ using namespace ffmpeg;
 
 
 
-struct VideoMetadata{
-    double videoFps;  // average frame rate for the video (float)
-    double videoDuration; // real world video duration in seconds (float)
-    double videoStartTime; // video start time in seconds (float)
+struct StreamMetadata{
+    torch::Tensor frameRate;  // average frame rate for the video (float)
+    torch::Tensor duration; // real world video duration in seconds (float)
+    // torch::Tensor startTime; // video start time in seconds (float)
+    torch::Tensor timeBase;
     // do we need a constructor here?
+    explicit StreamMetadata(){  // default state: every field is an empty (size 0) float tensor
+        frameRate = torch::zeros({0}, torch::kFloat);  // assign the members: the previous `torch::Tensor x = ...`
+        duration = torch::zeros({0}, torch::kFloat);   // form declared locals that shadowed the members, so the
+        timeBase = torch::zeros({0}, torch::kFloat);   // members themselves were never initialised
+    }
 };
 
+
 struct Video : torch::CustomClassHolder {
-    std::vector<VideoMetadata> Metadata;
-    // std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // metadata is defined as a dictionary where every 
+    // type has a vector containing metadata for that stream
+    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
+    
     public:
-        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        int getMetadata();
+        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
@@ -42,7 +55,7 @@ struct Video : torch::CustomClassHolder {
         DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
 }; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index e4dde7a5530..4c6cc6c09cd 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,14 +3,11 @@
 
 #include "Video.h"
 
-namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+TORCH_LIBRARY(torchvision, m) {
 
-} //namespace
+    m.class_<Video>("video")
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
+}
 #endif

From 57763cfff6048f94a0270c18fd9348b1ff81a4c2 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:29:15 -0500
Subject: [PATCH 005/128] FAIL update for QS

---
 torchvision/csrc/cpu/video/Video.cpp | 10 +++++-----
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index fb7a6891a6b..f86ac66375b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -145,14 +145,14 @@ Video::Video(
             };
         }
         VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"autio", audioStreams});
+        VideoMetadata.insert({"audio", audioStreams});
     } 
 } //video
 
-// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-int Video::getMetadata() {
-    // return VideoMetadata;
-    return 5;
+std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+// int Video::getMetadata() {
+    return VideoMetadata;
+    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index f9c104c1217..9e02afe9713 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -44,7 +44,7 @@ struct Video : torch::CustomClassHolder {
     
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        int getMetadata();
+        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")

From 9ded798111375ab1f51915a4ac2c9c1c1976b892 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 05:23:51 -0500
Subject: [PATCH 006/128] revert

---
 torchvision/csrc/cpu/video/Video.cpp    |  4 ++--
 torchvision/csrc/cpu/video/register.cpp | 14 +++++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index f86ac66375b..4f871e5131c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -104,14 +104,14 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
+    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
     std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
-            VLOG(1) << "Decoding stream of" << header.format.type ;
+            cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
             StreamMetadata streamInfo;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 4c6cc6c09cd..a8343762388 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,11 +3,15 @@
 
 #include "Video.h"
 
+namespace {
 
-TORCH_LIBRARY(torchvision, m) {
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool>());
+        // .def("get_metadata", &Video::getMetadata);
 
-    m.class_<Video>("video")
-        .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
-}
+} //namespace
 #endif

From ac7f1e589e3c8a1d6671b101182bb928c6f22735 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 06:32:23 -0500
Subject: [PATCH 007/128] debugging with Victor

---
 torchvision/csrc/cpu/video/Video.cpp    |  8 ++++----
 torchvision/csrc/cpu/video/Video.h      | 16 +++++++---------
 torchvision/csrc/cpu/video/register.cpp |  7 ++-----
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4f871e5131c..d62ece7a9e1 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -62,6 +62,7 @@ void Video::_getDecoderParams(
         videoFormat.format.video.format = defaultVideoPixelFormat;
         params.formats.insert(videoFormat);
 
+        // there is no clear way to use the other formats yet - TODO later
         // MediaFormat subtitleFormat("0", (long) -2);
         // subtitleFormat.type = TYPE_SUBTITLE;
         // MediaFormat ccFormat((double) 0, (long) -2);
@@ -69,7 +70,8 @@ void Video::_getDecoderParams(
 
     }
 
-    // define the stream using the correct parsing technique
+    // else use the stream using the correct parsing technique
+
 } // _get decoder params
 
 
@@ -139,7 +141,7 @@ Video::Video(
                 // parse stream fps
                 torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
                 float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples;
+                frameRateData[0] = (float) format.samples; // this is user defined? 
                 streamInfo.frameRate = frameRate;
                 audioStreams.push_back(streamInfo);
             };
@@ -150,9 +152,7 @@ Video::Video(
 } //video
 
 std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-// int Video::getMetadata() {
     return VideoMetadata;
-    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9e02afe9713..92a8e939918 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,7 +11,6 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
-#include <torch/custom_class.h>
 
 
 #include <exception>
@@ -22,7 +21,6 @@
 using namespace ffmpeg;
 
 
-
 struct StreamMetadata{
     torch::Tensor frameRate;  // average frame rate for the video (float)
     torch::Tensor duration; // real world video duration in seconds (float)
@@ -37,25 +35,25 @@ struct StreamMetadata{
 };
 
 
+
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
     // type has a vector containing metadata for that stream
     std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
     
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    Video(std::string videoPath, std::string stream, bool isReadFile);
+    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
     // protected:
         // AV container type (check in decoder for exact type)
-    private:
-        DecoderParameters params;
+    DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-}; // class Video
+    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+}; // struct Video
+
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index a8343762388..357f4ccfe4c 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -5,13 +5,10 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>());
-        // .def("get_metadata", &Video::getMetadata);
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
 
 } //namespace
 #endif

From 24718de6f1f07be3ca072cad3b675023a4182048 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 13 Aug 2020 13:35:26 -0500
Subject: [PATCH 008/128] adding base files

---
 torchvision/csrc/cpu/video/Video.cpp |  0
 torchvision/csrc/cpu/video/Video.h   | 42 ++++++++++++++++++++++++++++
 video_reader.todo                    | 15 ++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 torchvision/csrc/cpu/video/Video.cpp
 create mode 100644 torchvision/csrc/cpu/video/Video.h
 create mode 100644 video_reader.todo

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
new file mode 100644
index 00000000000..13d05a90128
--- /dev/null
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifndef VIDEO_H_
+#define VIDEO_H_
+
+
+#include <string>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <Python.h>
+#include <c10/util/Logging.h>
+#include <torch/script.h>
+
+#include "../decoder/Stream.h"
+
+
+
+struct VideoMetadata{
+    double videoFps;  // average frame rate for the video (float)
+    double videoDuration; // real world video duration in seconds (float)
+    double videoStartTime; // video start time in seconds (float)
+    // do we need a constructor here?
+}
+
+class Video {
+    std::vector<VideoMetadata> Metadata;
+    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    public:
+        Video(std::string filename, std::string stream="video");
+        void Seek(double ts, std::string stream="", bool any_frame=False);
+        torch::List<torch::Tensor> Next(std::string stream="")
+        torch::List<torch::Tensor> Peak(std::string stream="")
+    protected:
+        // AV container type (check in decoder for exact type)
+    private:
+        int64_t SecToStream(double ts); // TODO: add stream type
+        float StreamToSec(int64_t pts); // TODO: add stream type
+        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
+} // class Video
+
+#endif  // VIDEO_H_
diff --git a/video_reader.todo b/video_reader.todo
new file mode 100644
index 00000000000..2a01bbde4b8
--- /dev/null
+++ b/video_reader.todo
@@ -0,0 +1,15 @@
+The new API:
+    ☐ the c++ extension is going to live in torchvision/csrc/cpu/video
+    ☐ modification of the build needs to go to setup.py
+    ☐ torchvision/io/_video something needs to happen somehow
+
+Tests changes:
+    ☐ test/test_io.py
+    ☐ test/test_video_reader.py (change to test video api)
+
+
+
+Implementation:
+    ☐ Datatype for strem
+    ☐ Datatype for container
+    ☐ Do I use tensor as a type in metadata 
\ No newline at end of file

From 6800811746a9ebed9e1464265fb7eab3aa2a128c Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 14 Aug 2020 04:05:59 -0500
Subject: [PATCH 009/128] setup modification to actually build the thing

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1bc84897fa6..183da81fcb7 100644
--- a/setup.py
+++ b/setup.py
@@ -343,10 +343,13 @@ def get_extensions():
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(
             os.path.join(base_decoder_src_dir, "*.cpp"))
+        # Torchvision video API
+        videoapi_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video')
+        videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp"))
         # exclude tests
         base_decoder_src = [x for x in base_decoder_src if '_test.cpp' not in x]
 
-        combined_src = video_reader_src + base_decoder_src
+        combined_src = video_reader_src + base_decoder_src + videoapi_src
 
         ext_modules.append(
             CppExtension(
@@ -355,6 +358,7 @@ def get_extensions():
                 include_dirs=[
                     base_decoder_src_dir,
                     video_reader_src_dir,
+                    videoapi_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

From 2cf981ce750ca166a9173fc4267f87ed552d2689 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 17 Aug 2020 10:07:05 -0500
Subject: [PATCH 010/128] video api constructor registration

---
 torchvision/csrc/cpu/video/Video.cpp    | 119 ++++++++++++++++++++++++
 torchvision/csrc/cpu/video/Video.h      |  32 ++++---
 torchvision/csrc/cpu/video/register.cpp |  16 ++++
 3 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 torchvision/csrc/cpu/video/register.cpp

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index e69de29bb2d..cb3cfa3e275 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -0,0 +1,119 @@
+
+# include "Video.h"
+#include <torch/script.h>
+#include <c10/util/Logging.h>
+#include "sync_decoder.h"
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+
+using namespace std;
+using namespace ffmpeg;
+
+
+// If we are in a Windows environment, we need to define
+// initialization functions for the _custom_ops extension
+#ifdef _WIN32
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC init_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#else
+PyMODINIT_FUNC PyInit_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#endif
+#endif
+
+
+// namespace Video{
+const size_t decoderTimeoutMs = 600000;
+const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
+const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
+// A jitter can be added to the end of the range to avoid conversion/rounding
+// error, small value 100us won't be enough to select the next frame, but enough
+// to compensate rounding error due to the multiple conversions.
+const size_t timeBaseJitterUs = 100;
+
+void Video::_getDecoderParams(
+        int64_t videoStartUs,
+        int64_t getPtsOnly,
+        // how enum works, but stream type
+        int stream_id=-1,
+        double seekFrameMarginUs=10){
+
+    params.headerOnly = getPtsOnly != 0;
+    params.seekAccuracy = seekFrameMarginUs;
+    params.startOffset = videoStartUs;
+    params.timeoutMs = decoderTimeoutMs;
+    params.preventStaleness = false;  // not sure what this is about
+
+    // define the stream using the correct parsing technique
+} // _get decoder params
+
+
+Video::Video(
+    std::string videoPath, 
+    std::string stream, 
+    bool isReadFile, 
+    int64_t audioSamples=0, 
+    int64_t audioChannels=1) {
+
+
+    //parse stream information
+
+    // set current stream
+    DecoderParameters params;
+    Video::_getDecoderParams(
+        0,   // video start
+        false,  //headerOnly
+        // stream_type parsed from info above
+        // stream_id parsed from info above
+        audioSamples,
+        audioChannels
+    );
+
+    std::string logMessage, logType;
+    DecoderInCallback callback = nullptr;
+    // TODO: add read from memory option
+    params.uri = videoPath;
+    logType = "file";
+    logMessage = videoPath;
+    
+
+    // get a decoder
+    SyncDecoder decoder;
+    bool succeeded;
+
+    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+          << "] has started";
+
+    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    std::vector<DecoderMetadata> metadata;
+    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+        for (const auto& header : metadata) {
+            VLOG(1) << "Decoding stream of" << header.format.type ;
+        if (header.format.type == TYPE_VIDEO) {
+            videoMetadata = header;
+        } else if (header.format.type == TYPE_AUDIO) {
+            audioMetadata = header;
+        } else {
+            dataMetadata = header;
+        };
+        }
+    } 
+} //video
+
+// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
+// }
+
+// torch::List<torch::Tensor> Video::Next(){
+//     return
+// }
+
+
+
+// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 13d05a90128..00e3232de26 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -12,7 +12,12 @@
 #include <c10/util/Logging.h>
 #include <torch/script.h>
 
-#include "../decoder/Stream.h"
+#include <exception>
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+using namespace ffmpeg;
 
 
 
@@ -21,22 +26,23 @@ struct VideoMetadata{
     double videoDuration; // real world video duration in seconds (float)
     double videoStartTime; // video start time in seconds (float)
     // do we need a constructor here?
-}
+};
 
-class Video {
+struct Video : torch::CustomClassHolder {
     std::vector<VideoMetadata> Metadata;
-    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // std::vector<Stream> AvailStreams;  // TODO: add stream type
     public:
-        Video(std::string filename, std::string stream="video");
-        void Seek(double ts, std::string stream="", bool any_frame=False);
-        torch::List<torch::Tensor> Next(std::string stream="")
-        torch::List<torch::Tensor> Peak(std::string stream="")
-    protected:
+        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        // void Seek(double ts, std::string stream="", bool any_frame=False);
+        // torch::List<torch::Tensor> Next(std::string stream="")
+        // torch::List<torch::Tensor> Peak(std::string stream="")
+    // protected:
         // AV container type (check in decoder for exact type)
     private:
-        int64_t SecToStream(double ts); // TODO: add stream type
-        float StreamToSec(int64_t pts); // TODO: add stream type
-        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
-} // class Video
+        DecoderParameters params;
+        // int64_t SecToStream(double ts); // TODO: add stream type
+        // float StreamToSec(int64_t pts); // TODO: add stream type
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+}; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
new file mode 100644
index 00000000000..e4dde7a5530
--- /dev/null
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -0,0 +1,16 @@
+#ifndef REGISTER_H
+#define REGISTER_H
+
+#include "Video.h"
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+
+} //namespace
+#endif

From 30263e4a522effdff0442ae26a9a57c5f144a88c Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:27:47 -0500
Subject: [PATCH 011/128] FAIL metadata

---
 torchvision/csrc/cpu/video/Video.cpp    | 91 ++++++++++++++++++-------
 torchvision/csrc/cpu/video/Video.h      | 29 +++++---
 torchvision/csrc/cpu/video/register.cpp | 13 ++--
 3 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index cb3cfa3e275..fb7a6891a6b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,7 +29,6 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
-// namespace Video{
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -43,6 +42,7 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
+        bool all_streams=false,
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -51,6 +51,24 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
+    if (all_streams == true){
+        MediaFormat audioFormat((long) -2);
+        audioFormat.type = TYPE_AUDIO;
+        audioFormat.format.audio.format = defaultAudioSampleFormat;
+        params.formats.insert(audioFormat);
+
+        MediaFormat videoFormat(0, (long) -2);
+        videoFormat.type = TYPE_VIDEO;
+        videoFormat.format.video.format = defaultVideoPixelFormat;
+        params.formats.insert(videoFormat);
+
+        // MediaFormat subtitleFormat("0", (long) -2);
+        // subtitleFormat.type = TYPE_SUBTITLE;
+        // MediaFormat ccFormat((double) 0, (long) -2);
+        // ccFormat.type = TYPE_CC;
+
+    }
+
     // define the stream using the correct parsing technique
 } // _get decoder params
 
@@ -58,22 +76,20 @@ void Video::_getDecoderParams(
 Video::Video(
     std::string videoPath, 
     std::string stream, 
-    bool isReadFile, 
-    int64_t audioSamples=0, 
-    int64_t audioChannels=1) {
+    bool isReadFile) {
 
 
     //parse stream information
 
     // set current stream
+    // note that in the initial version we want to get all streams
     DecoderParameters params;
     Video::_getDecoderParams(
-        0,   // video start
+        0,      // video start
         false,  //headerOnly
         // stream_type parsed from info above
-        // stream_id parsed from info above
-        audioSamples,
-        audioChannels
+        -2,     // stream_id parsed from info above
+        true    // read all streams
     );
 
     std::string logMessage, logType;
@@ -88,32 +104,55 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
-
-    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    
+    std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
             VLOG(1) << "Decoding stream of" << header.format.type ;
-        if (header.format.type == TYPE_VIDEO) {
-            videoMetadata = header;
-        } else if (header.format.type == TYPE_AUDIO) {
-            audioMetadata = header;
-        } else {
-            dataMetadata = header;
-        };
+        
+            // generate streamMetadata object
+            StreamMetadata streamInfo;
+            // parse stream timebase
+            torch::Tensor timeBase = torch::zeros({1}, torch::kFloat);
+            float * timeBaseData = timeBase.data_ptr<float>();
+            timeBaseData[0] = header.num / header.den;
+            streamInfo.timeBase = timeBase;
+            // parse stream duration
+            torch::Tensor duration = torch::zeros({1}, torch::kFloat);
+            float* durationData = duration.data_ptr<float>();
+            durationData[0] = (float) header.duration;
+            // to get duration in seconds multiply duration by timebase
+            streamInfo.duration = duration * streamInfo.timeBase;
+            
+            if (header.format.type == TYPE_VIDEO) {
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = header.fps;
+                streamInfo.frameRate = frameRate;
+                videoStreams.push_back(streamInfo);
+            } else if (header.format.type == TYPE_AUDIO) {
+                const auto& format = header.format.format.audio;
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = (float) format.samples;
+                streamInfo.frameRate = frameRate;
+                audioStreams.push_back(streamInfo);
+            };
         }
+        VideoMetadata.insert({"video", videoStreams});
+        VideoMetadata.insert({"autio", audioStreams});
     } 
 } //video
 
-// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
-// }
-
-// torch::List<torch::Tensor> Video::Next(){
-//     return
-// }
-
+// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+int Video::getMetadata() {
+    // return VideoMetadata;
+    return 5;
+}
 
 
-// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 00e3232de26..f9c104c1217 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,6 +11,8 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
+#include <torch/custom_class.h>
+
 
 #include <exception>
 #include "sync_decoder.h"
@@ -21,18 +23,29 @@ using namespace ffmpeg;
 
 
 
-struct VideoMetadata{
-    double videoFps;  // average frame rate for the video (float)
-    double videoDuration; // real world video duration in seconds (float)
-    double videoStartTime; // video start time in seconds (float)
+struct StreamMetadata{
+    torch::Tensor frameRate;  // average frame rate for the video (float)
+    torch::Tensor duration; // real world video duration in seconds (float)
+    // torch::Tensor startTime; // video start time in seconds (float)
+    torch::Tensor timeBase;
     // do we need a constructor here?
+    explicit StreamMetadata(){
+        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
+        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
+        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
+    }
 };
 
+
 struct Video : torch::CustomClassHolder {
-    std::vector<VideoMetadata> Metadata;
-    // std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // metadata is defined as a dictionary where every 
+    // type has a vector containing metadata for that stream
+    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
+    
     public:
-        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        int getMetadata();
+        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
@@ -42,7 +55,7 @@ struct Video : torch::CustomClassHolder {
         DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
 }; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index e4dde7a5530..4c6cc6c09cd 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,14 +3,11 @@
 
 #include "Video.h"
 
-namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+TORCH_LIBRARY(torchvision, m) {
 
-} //namespace
+    m.class_<Video>("video")
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
+}
 #endif

From d58e8b7c4b85b1ea281674483d574090a18ae795 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:29:15 -0500
Subject: [PATCH 012/128] FAIL update for QS

---
 torchvision/csrc/cpu/video/Video.cpp | 10 +++++-----
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index fb7a6891a6b..f86ac66375b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -145,14 +145,14 @@ Video::Video(
             };
         }
         VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"autio", audioStreams});
+        VideoMetadata.insert({"audio", audioStreams});
     } 
 } //video
 
-// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-int Video::getMetadata() {
-    // return VideoMetadata;
-    return 5;
+std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+// int Video::getMetadata() {
+    return VideoMetadata;
+    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index f9c104c1217..9e02afe9713 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -44,7 +44,7 @@ struct Video : torch::CustomClassHolder {
     
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        int getMetadata();
+        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")

From f5657ec48e5b32049f5e25fe18f69a9c72592c7e Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 05:23:51 -0500
Subject: [PATCH 013/128] revert

---
 torchvision/csrc/cpu/video/Video.cpp    |  4 ++--
 torchvision/csrc/cpu/video/register.cpp | 14 +++++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index f86ac66375b..4f871e5131c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -104,14 +104,14 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
+    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
     std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
-            VLOG(1) << "Decoding stream of" << header.format.type ;
+            cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
             StreamMetadata streamInfo;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 4c6cc6c09cd..a8343762388 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,11 +3,15 @@
 
 #include "Video.h"
 
+namespace {
 
-TORCH_LIBRARY(torchvision, m) {
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool>());
+        // .def("get_metadata", &Video::getMetadata);
 
-    m.class_<Video>("video")
-        .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
-}
+} //namespace
 #endif

From f5284ec40a5a86b76c7776526ed4799ec1228375 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 06:32:23 -0500
Subject: [PATCH 014/128] debugging with Victor

---
 torchvision/csrc/cpu/video/Video.cpp    |  8 ++++----
 torchvision/csrc/cpu/video/Video.h      | 16 +++++++---------
 torchvision/csrc/cpu/video/register.cpp |  7 ++-----
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4f871e5131c..d62ece7a9e1 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -62,6 +62,7 @@ void Video::_getDecoderParams(
         videoFormat.format.video.format = defaultVideoPixelFormat;
         params.formats.insert(videoFormat);
 
+        // there is no clear way on how to use other formats- todo later
         // MediaFormat subtitleFormat("0", (long) -2);
         // subtitleFormat.type = TYPE_SUBTITLE;
         // MediaFormat ccFormat((double) 0, (long) -2);
@@ -69,7 +70,8 @@ void Video::_getDecoderParams(
 
     }
 
-    // define the stream using the correct parsing technique
+    // else use the stream using the correct parsing technique
+
 } // _get decoder params
 
 
@@ -139,7 +141,7 @@ Video::Video(
                 // parse stream fps
                 torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
                 float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples;
+                frameRateData[0] = (float) format.samples; // this is user defined? 
                 streamInfo.frameRate = frameRate;
                 audioStreams.push_back(streamInfo);
             };
@@ -150,9 +152,7 @@ Video::Video(
 } //video
 
 std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-// int Video::getMetadata() {
     return VideoMetadata;
-    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9e02afe9713..92a8e939918 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,7 +11,6 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
-#include <torch/custom_class.h>
 
 
 #include <exception>
@@ -22,7 +21,6 @@
 using namespace ffmpeg;
 
 
-
 struct StreamMetadata{
     torch::Tensor frameRate;  // average frame rate for the video (float)
     torch::Tensor duration; // real world video duration in seconds (float)
@@ -37,25 +35,25 @@ struct StreamMetadata{
 };
 
 
+
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
     // type has a vector containing metadata for that stream
     std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
     
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    Video(std::string videoPath, std::string stream, bool isReadFile);
+    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
     // protected:
         // AV container type (check in decoder for exact type)
-    private:
-        DecoderParameters params;
+    DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-}; // class Video
+    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+}; // struct Video
+
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index a8343762388..357f4ccfe4c 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -5,13 +5,10 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>());
-        // .def("get_metadata", &Video::getMetadata);
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
 
 } //namespace
 #endif

From 1398dd6a190d114a7f82d456c781c6023b239b88 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 21 Aug 2020 05:14:22 -0500
Subject: [PATCH 015/128] metadata registration works

---
 torchvision/csrc/cpu/video/Video.cpp    | 157 +++++++++++++++++-------
 torchvision/csrc/cpu/video/Video.h      |  43 ++++---
 torchvision/csrc/cpu/video/register.cpp |   3 +-
 3 files changed, 138 insertions(+), 65 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d62ece7a9e1..449bda97e82 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -37,11 +37,77 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // to compensate rounding error due to the multiple conversions.
 const size_t timeBaseJitterUs = 100;
 
+
+std::string parse_type_to_string(const std::string& stream_string) {
+  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
+      {"video", TYPE_VIDEO},
+      {"audio", TYPE_AUDIO},
+      {"subtitle", TYPE_SUBTITLE},
+      {"cc", TYPE_CC},
+  }};
+  auto device = std::find_if(
+      types.begin(),
+      types.end(),
+      [stream_string](const std::pair<std::string, MediaType>& p) {
+        return p.first == stream_string;
+      });
+  if (device != types.end()) {
+    return device->first;
+  }
+  AT_ERROR(
+      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+}
+
+MediaType parse_type_to_mt(const std::string& stream_string) {
+  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
+      {"video", TYPE_VIDEO},
+      {"audio", TYPE_AUDIO},
+      {"subtitle", TYPE_SUBTITLE},
+      {"cc", TYPE_CC},
+  }};
+  auto device = std::find_if(
+      types.begin(),
+      types.end(),
+      [stream_string](const std::pair<std::string, MediaType>& p) {
+        return p.first == stream_string;
+      });
+  if (device != types.end()) {
+    return device->second;
+  }
+  AT_ERROR(
+      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+}
+
+std::tuple<std::string, int64_t> Video::_parseStream(const std::string& streamString){
+    TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
+    static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
+    std::smatch match;
+
+    TORCH_CHECK(
+        std::regex_match(streamString, match, regex),
+        "Invalid stream string: '", streamString, "'");
+    
+    std::string type_ = "video";
+    type_ = parse_type_to_string(match[1].str());
+    int64_t index_ = -1;
+    if (match[2].matched) {
+        try {
+        index_ = c10::stoi(match[2].str());
+        } catch (const std::exception &) {
+        AT_ERROR(
+            "Could not parse device index '", match[2].str(),
+            "' in device string '", streamString, "'");
+        }
+    }
+    return std::make_tuple(type_, index_);
+}
+
+
 void Video::_getDecoderParams(
         int64_t videoStartUs,
         int64_t getPtsOnly,
-        // how enum works, but stream type
-        int stream_id=-1,
+        std::string stream,
+        long stream_id=-1,
         bool all_streams=false,
         double seekFrameMarginUs=10){
 
@@ -63,10 +129,13 @@ void Video::_getDecoderParams(
         params.formats.insert(videoFormat);
 
         // there is no clear way on how to use other formats- todo later
-        // MediaFormat subtitleFormat("0", (long) -2);
-        // subtitleFormat.type = TYPE_SUBTITLE;
-        // MediaFormat ccFormat((double) 0, (long) -2);
-        // ccFormat.type = TYPE_CC;
+        MediaFormat subtitleFormat(char('0'), long(-2));
+        subtitleFormat.type = TYPE_SUBTITLE;
+        params.formats.insert(subtitleFormat);
+
+        MediaFormat ccFormat(double(0), long(-2));
+        ccFormat.type = TYPE_CC;
+        params.formats.insert(ccFormat);
 
     }
 
@@ -80,17 +149,15 @@ Video::Video(
     std::string stream, 
     bool isReadFile) {
 
-
     //parse stream information
-
-    // set current stream
+    current_stream = _parseStream(stream);
     // note that in the initial version we want to get all streams
-    DecoderParameters params;
+
     Video::_getDecoderParams(
         0,      // video start
         false,  //headerOnly
-        // stream_type parsed from info above
-        -2,     // stream_id parsed from info above
+        get<0>(current_stream),
+        long(-2),     // stream_id parsed from info above
         true    // read all streams
     );
 
@@ -103,56 +170,62 @@ Video::Video(
     
 
     // get a decoder
-    SyncDecoder decoder;
     bool succeeded;
 
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
-    std::vector<StreamMetadata> videoStreams, audioStreams;
+    std::vector<double> videoFPS, audioFPS, ccFPS, subtitleFPS;
+
     std::vector<DecoderMetadata> metadata;
-    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+    succeeded = decoder.init(params, std::move(callback), &metadata);
+    if (succeeded) {
         for (const auto& header : metadata) {
             cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
-            StreamMetadata streamInfo;
+            // std::map<std::string, double> streamInfo;
             // parse stream timebase
-            torch::Tensor timeBase = torch::zeros({1}, torch::kFloat);
-            float * timeBaseData = timeBase.data_ptr<float>();
-            timeBaseData[0] = header.num / header.den;
-            streamInfo.timeBase = timeBase;
+            // streamInfo.insert({"timeBase", (double) (header.num / header.den)});
             // parse stream duration
-            torch::Tensor duration = torch::zeros({1}, torch::kFloat);
-            float* durationData = duration.data_ptr<float>();
-            durationData[0] = (float) header.duration;
             // to get duration in seconds multiply duration by timebase
-            streamInfo.duration = duration * streamInfo.timeBase;
-            
+            // streamInfo.insert({"duration", (double) header.duration * (double) (header.num / header.den)});
+                        
             if (header.format.type == TYPE_VIDEO) {
                 // parse stream fps
-                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
-                float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = header.fps;
-                streamInfo.frameRate = frameRate;
-                videoStreams.push_back(streamInfo);
+                double fps = double(header.fps);
+                videoFPS.push_back(fps);
             } else if (header.format.type == TYPE_AUDIO) {
-                const auto& format = header.format.format.audio;
-                // parse stream fps
-                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
-                float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples; // this is user defined? 
-                streamInfo.frameRate = frameRate;
-                audioStreams.push_back(streamInfo);
+                // parse stream fps (user defined, doesn't seem cool)
+                double fps = double(0);
+                audioFPS.push_back(fps);
+            } else{
+                cout << "Got type" << header.format.type; 
             };
         }
-        VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"audio", audioStreams});
-    } 
+
+    } else{
+        audioFPS.push_back((-1.0));
+        videoFPS.push_back((-1.0));
+
+    }
+    streamMetadata.insert({"video", videoFPS});
+    streamMetadata.insert({"audio", audioFPS});
 } //video
 
-std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-    return VideoMetadata;
+std::tuple<std::string, int64_t> Video::getCurrentStream() const {
+    return current_stream;
+}
+
+std::vector<double> Video::getFPS(std::string stream) const{
+    // add safety check
+    std::string stream_str = parse_type_to_string(stream);
+    return streamMetadata.at(stream_str);
 }
 
 
+// std::map<std::string, std::vector<std::map<std::string, double>>> Video::getMetadata() const {
+//     return VideoMetadata;
+// }
+
+
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 92a8e939918..9c80cb54971 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -6,6 +6,8 @@
 
 #include <string>
 #include <vector>
+#include <regex>
+#include <map>
 
 #include <ATen/ATen.h>
 #include <Python.h>
@@ -21,38 +23,35 @@
 using namespace ffmpeg;
 
 
-struct StreamMetadata{
-    torch::Tensor frameRate;  // average frame rate for the video (float)
-    torch::Tensor duration; // real world video duration in seconds (float)
-    // torch::Tensor startTime; // video start time in seconds (float)
-    torch::Tensor timeBase;
-    // do we need a constructor here?
-    explicit StreamMetadata(){
-        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
-        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
-        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
-    }
-};
-
 
 
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
-    // type has a vector containing metadata for that stream
-    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
-    
-    Video(std::string videoPath, std::string stream, bool isReadFile);
-    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    // type value is a list of lists that contains tuple <char: "info", double: "value">
+    std::tuple<std::string, int64_t> current_stream;
+    std::map<std::string, std::vector<double>> streamMetadata;
+    public:
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        std::tuple<std::string, int64_t> getCurrentStream() const;
+        std::vector<double> getFPS(std::string stream) const;
+
+    private:
+        std::tuple<std::string, int64_t> _parseStream(const std::string& streamString);
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+
+    // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
-    // protected:
+    protected:
         // AV container type (check in decoder for exact type)
-    DecoderParameters params;
+        SyncDecoder decoder;
+        DecoderParameters params;
+
         // int64_t SecToStream(double ts); // TODO: add stream type
-        // float StreamToSec(int64_t pts); // TODO: add stream type
-    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        // double StreamToSec(int64_t pts); // TODO: add stream type
+    
 }; // struct Video
 
 
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 357f4ccfe4c..bfa3d58cec7 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -8,7 +8,8 @@ namespace {
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
+        .def("get_current_stream", &Video::getCurrentStream)
+        .def("get_FPS", &Video::getFPS);
 
 } //namespace
 #endif

From c124bb16491efb3d139daa4fd9ddf76827cc14a9 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 26 Aug 2020 04:34:51 -0500
Subject: [PATCH 016/128] API build next

---
 torchvision/csrc/cpu/video/Video.cpp    | 165 +++++++++++++++++++-----
 torchvision/csrc/cpu/video/Video.h      |  26 ++--
 torchvision/csrc/cpu/video/register.cpp |   5 +-
 3 files changed, 153 insertions(+), 43 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 449bda97e82..997ba537e9f 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -78,7 +78,7 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
       "Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, int64_t> Video::_parseStream(const std::string& streamString){
+std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
     TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
     static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
     std::smatch match;
@@ -137,9 +137,47 @@ void Video::_getDecoderParams(
         ccFormat.type = TYPE_CC;
         params.formats.insert(ccFormat);
 
-    }
+    } else{
+        // TODO: reset params.formats
+        std::set<MediaFormat> formats;
+        params.formats = formats;
+        MediaType stream_type = parse_type_to_mt(stream);
+        // now here is a mindfuck 
+        // - there is no way to construct mediaformat by type so we actually
+        // need an endless if/then
+        switch(stream_type) {
+            case TYPE_VIDEO:
+            {
+                MediaFormat videoFormat(0, (long) stream_id);
+                videoFormat.type = TYPE_VIDEO;
+                videoFormat.format.video.format = defaultVideoPixelFormat;
+                params.formats.insert(videoFormat);
+                break;
+            }
+            case TYPE_AUDIO:
+            {        
+                MediaFormat audioFormat((long) stream_id);
+                audioFormat.type = TYPE_AUDIO;
+                audioFormat.format.audio.format = defaultAudioSampleFormat;
+                params.formats.insert(audioFormat);
+                break;
+            }
+            // case TYPE_CC:
+            //     MediaFormat subtitleFormat(char('0'), long(stream_id));
+            //     subtitleFormat.type = TYPE_SUBTITLE;
+            //     params.formats.insert(subtitleFormat);
+            //     break;
+            default:
+            {
+                MediaFormat videoFormat(0, (long) -1);
+                videoFormat.type = TYPE_VIDEO;
+                videoFormat.format.video.format = defaultVideoPixelFormat;
+                params.formats.insert(videoFormat);
+                break;
+            }
+        }
 
-    // else use the stream using the correct parsing technique
+    }
 
 } // _get decoder params
 
@@ -162,7 +200,7 @@ Video::Video(
     );
 
     std::string logMessage, logType;
-    DecoderInCallback callback = nullptr;
+    
     // TODO: add read from memory option
     params.uri = videoPath;
     logType = "file";
@@ -175,42 +213,50 @@ Video::Video(
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
-    std::vector<double> videoFPS, audioFPS, ccFPS, subtitleFPS;
 
-    std::vector<DecoderMetadata> metadata;
+    
+    std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
+    std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
+    std::vector<double> audioTB, videoTB, ccTB, subsTB;
+
+    // calback and metadata defined in struct
+    callback = nullptr;
     succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded) {
         for (const auto& header : metadata) {
-            cout << "Decoding stream of" << header.format.type ;
-        
-            // generate streamMetadata object
-            // std::map<std::string, double> streamInfo;
-            // parse stream timebase
-            // streamInfo.insert({"timeBase", (double) (header.num / header.den)});
-            // parse stream duration
-            // to get duration in seconds multiply duration by timebase
-            // streamInfo.insert({"duration", (double) header.duration * (double) (header.num / header.den)});
-                        
+            double fps = double(header.fps);
+            double timeBase = double(header.num) / double(header.den);
+            double duration = double(header.duration) * 1e-6; // * timeBase;
+
+
+            cout << "Decoding stream of" << header.format.type;
+            cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
+
+
             if (header.format.type == TYPE_VIDEO) {
-                // parse stream fps
-                double fps = double(header.fps);
                 videoFPS.push_back(fps);
+                videoDuration.push_back(duration);
+                videoTB.push_back(timeBase);
             } else if (header.format.type == TYPE_AUDIO) {
-                // parse stream fps (user defined, doesn't seem cool)
-                double fps = double(0);
                 audioFPS.push_back(fps);
-            } else{
-                cout << "Got type" << header.format.type; 
+                audioDuration.push_back(duration);
+                audioTB.push_back(timeBase);
+            } else if (header.format.type == TYPE_CC){
+                ccFPS.push_back(fps);
+                ccDuration.push_back(duration);
+                ccTB.push_back(timeBase);
+            } else if (header.format.type == TYPE_SUBTITLE){
+                subsFPS.push_back(fps);
+                subsDuration.push_back(duration);
+                subsTB.push_back(timeBase);
             };
         }
 
-    } else{
-        audioFPS.push_back((-1.0));
-        videoFPS.push_back((-1.0));
-
     }
-    streamMetadata.insert({"video", videoFPS});
-    streamMetadata.insert({"audio", audioFPS});
+
+    streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
+    streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+    streamTimeBase.insert({{"video", videoTB}, {"audio", audioTB}});
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -219,13 +265,66 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
 
 std::vector<double> Video::getFPS(std::string stream) const{
     // add safety check
-    std::string stream_str = parse_type_to_string(stream);
-    return streamMetadata.at(stream_str);
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    std::string stream_str = get<0>(stream_tpl);
+    // check if the stream exists
+    return streamFPS.at(stream_str);
+}
+
+std::vector<double> Video::getDuration(std::string stream) const{
+    // add safety check
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    std::string stream_str = get<0>(stream_tpl);
+    // check if the stream exists
+    return streamDuration.at(stream_str);
 }
 
+int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    // check if the stream exists
+
+    // convert time to microseconds and cast to unsigned long int
+    int64_t ts_out = int64_t(ts * 1e6);
+
+    Video::_getDecoderParams(
+        ts_out,
+        0, // we're in full get frame mode
+        get<0>(stream_tpl),
+        get<1>(stream_tpl),
+        false);
+    
+    bool succeeded = decoder.init(params, std::move(callback), &metadata);
+    if (succeeded){
+        return 0;
+    }
+
+    return 1;
+
+}
+
+
+int64_t Video::Next(std::string stream=""){
+
+    DecoderOutputMessage out;
+    int64_t res = decoder.decode(&out, decoderTimeoutMs);
+
+    if (res == 0){
+        return 0;
+    }
+    
+    return 1;
+}
+
+
 
-// std::map<std::string, std::vector<std::map<std::string, double>>> Video::getMetadata() const {
-//     return VideoMetadata;
-// }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9c80cb54971..4e8d10f5d8f 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -19,6 +19,7 @@
 #include "sync_decoder.h"
 #include "memory_buffer.h"
 #include "defs.h"
+#include "util.h"
 
 using namespace ffmpeg;
 
@@ -26,28 +27,35 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    // metadata is defined as a dictionary where every 
-    // type value is a list of lists that contains tuple <char: "info", double: "value">
+    bool any_frame=false; // add this to input parameters
     std::tuple<std::string, int64_t> current_stream;
-    std::map<std::string, std::vector<double>> streamMetadata;
+    std::map<std::string, std::vector<double>> streamFPS;
+    std::map<std::string, std::vector<double>> streamDuration;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
         std::tuple<std::string, int64_t> getCurrentStream() const;
-        std::vector<double> getFPS(std::string stream) const;
+        std::vector<double> getDuration(std::string stream="") const;
+        std::vector<double> getFPS(std::string stream="") const;
+        int64_t Seek(double ts, std::string stream, bool any_frame);
+        int64_t Next(std::string stream); //torch::List<torch::Tensor>
 
     private:
-        std::tuple<std::string, int64_t> _parseStream(const std::string& streamString);
         void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        std::map<std::string, std::vector<double>> streamTimeBase;
 
+        SyncDecoder decoder;
+        DecoderParameters params;
+
+        DecoderInCallback callback;
+        std::vector<DecoderMetadata> metadata;
     // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
-        // void Seek(double ts, std::string stream="", bool any_frame=False);
-        // torch::List<torch::Tensor> Next(std::string stream="")
+        
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)
-        SyncDecoder decoder;
-        DecoderParameters params;
+        
+        
 
         // int64_t SecToStream(double ts); // TODO: add stream type
         // double StreamToSec(int64_t pts); // TODO: add stream type
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index bfa3d58cec7..091052f4808 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,7 +9,10 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
-        .def("get_FPS", &Video::getFPS);
+        .def("duration", &Video::getDuration)
+        .def("fps", &Video::getFPS)
+        .def("seek", &Video::Seek)
+        .def("next", &Video::Next);
 
 } //namespace
 #endif

From c43c729ba1d7245f2c58cd1c06594de0f25e1e46 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 27 Aug 2020 04:15:41 -0500
Subject: [PATCH 017/128] test

---
 dev.py                               |  4 +++
 torchvision/csrc/cpu/video/Video.cpp | 38 +++++++++++++++-------------
 torchvision/csrc/cpu/video/Video.h   |  3 ++-
 3 files changed, 26 insertions(+), 19 deletions(-)
 create mode 100644 dev.py

diff --git a/dev.py b/dev.py
new file mode 100644
index 00000000000..9a66f867837
--- /dev/null
+++ b/dev.py
@@ -0,0 +1,4 @@
+import torch, torchvision
+video_path = "/home/bjuncek/work/video_reader_benchmark/videos/R6llTwEh07w.mp4"
+video = torch.classes.torchvision.Video(video_path, "video:0", True)
+video.next("video")
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 997ba537e9f..05d36b24c7a 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -14,19 +14,19 @@ using namespace ffmpeg;
 
 // If we are in a Windows environment, we need to define
 // initialization functions for the _custom_ops extension
-#ifdef _WIN32
-#if PY_MAJOR_VERSION < 3
-PyMODINIT_FUNC init_video_reader(void) {
-  // No need to do anything.
-  return NULL;
-}
-#else
-PyMODINIT_FUNC PyInit_video_reader(void) {
-  // No need to do anything.
-  return NULL;
-}
-#endif
-#endif
+// #ifdef _WIN32
+// #if PY_MAJOR_VERSION < 3
+// PyMODINIT_FUNC init_video_reader(void) {
+//   // No need to do anything.
+//   return NULL;
+// }
+// #else
+// PyMODINIT_FUNC PyInit_video_reader(void) {
+//   // No need to do anything.
+//   return NULL;
+// }
+// #endif
+// #endif
 
 
 const size_t decoderTimeoutMs = 600000;
@@ -114,6 +114,7 @@ void Video::_getDecoderParams(
     params.headerOnly = getPtsOnly != 0;
     params.seekAccuracy = seekFrameMarginUs;
     params.startOffset = videoStartUs;
+    params.endOffset = std::numeric_limits<long>::infinity();
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
@@ -126,6 +127,10 @@ void Video::_getDecoderParams(
         MediaFormat videoFormat(0, (long) -2);
         videoFormat.type = TYPE_VIDEO;
         videoFormat.format.video.format = defaultVideoPixelFormat;
+        videoFormat.format.video.width = 0;
+        videoFormat.format.video.height = 0;
+        videoFormat.format.video.minDimension = 0;
+        videoFormat.format.video.maxDimension = 0;
         params.formats.insert(videoFormat);
 
         // there is no clear way on how to use other formats- todo later
@@ -207,11 +212,8 @@ Video::Video(
     logMessage = videoPath;
     
 
-    // get a decoder
-    bool succeeded;
-
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
-          << "] has started";
+          << "] has started \n";
     
 
     
@@ -220,7 +222,6 @@ Video::Video(
     std::vector<double> audioTB, videoTB, ccTB, subsTB;
 
     // calback and metadata defined in struct
-    callback = nullptr;
     succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded) {
         for (const auto& header : metadata) {
@@ -320,6 +321,7 @@ int64_t Video::Next(std::string stream=""){
     if (res == 0){
         return 0;
     }
+    out.payload.reset();
     
     return 1;
 }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 4e8d10f5d8f..bdc0cb85267 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -28,6 +28,7 @@ using namespace ffmpeg;
 
 struct Video : torch::CustomClassHolder {
     bool any_frame=false; // add this to input parameters
+    bool succeeded=false; // this is decoder init stuff
     std::tuple<std::string, int64_t> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
@@ -46,7 +47,7 @@ struct Video : torch::CustomClassHolder {
         SyncDecoder decoder;
         DecoderParameters params;
 
-        DecoderInCallback callback;
+        DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
     // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();

From d4452d9811ded9bcd3469dcea2ddeda564d6dfdf Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 27 Aug 2020 04:32:44 -0500
Subject: [PATCH 018/128] Merge change

---
 torchvision/csrc/cpu/video/Video.cpp | 10 +++++-----
 torchvision/csrc/cpu/video/Video.h   |  4 ----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 6c8a6474c7a..4cb735aae72 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -195,11 +195,11 @@ void Video::_getDecoderParams(
         // MediaFormat ccFormat((double) 0, (long) -2);
         // ccFormat.type = TYPE_CC;
 
-    }
+}
 
     // else use the stream using the correct parsing technique
 
-} // _get decoder params
+// } // _get decoder params
 
 
 Video::Video(
@@ -213,9 +213,9 @@ Video::Video(
 
     Video::_getDecoderParams(
         0,      // video start
-        false,  //headerOnly
-        get<0>(current_stream),
-        long(-21,     // stream_id parsed from info above
+        0,  //headerOnly
+        get<0>(current_stream), // stream
+        long(-1),     // stream_id parsed from info above
         true    // read all streams
     );
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 03ebe3c2ecb..30e4822fd7e 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -24,8 +24,6 @@
 using namespace ffmpeg;
 
 
-<<<<<<< HEAD
-
 
 struct Video : torch::CustomClassHolder {
     bool any_frame=false; // add this to input parameters
@@ -56,8 +54,6 @@ struct Video : torch::CustomClassHolder {
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)
-        
-        
 
         // int64_t SecToStream(double ts); // TODO: add stream type
         // double StreamToSec(int64_t pts); // TODO: add stream type

From 56a84c92ac73d2cee770449fcc30d85ce3939c28 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 28 Aug 2020 05:31:24 -0500
Subject: [PATCH 019/128] formatting parameters to avoid the segfault

---
 torchvision/csrc/cpu/video/Video.cpp | 115 +++++++++------------------
 torchvision/csrc/cpu/video/Video.h   |   2 -
 2 files changed, 37 insertions(+), 80 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4cb735aae72..e95c1b376de 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -31,7 +31,6 @@ using namespace ffmpeg;
 
 
 const size_t decoderTimeoutMs = 600000;
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -109,97 +108,57 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         std::string stream,
         long stream_id=-1,
-
         bool all_streams=false,
         double seekFrameMarginUs=10){
 
-    params.headerOnly = getPtsOnly != 0;
-    params.seekAccuracy = seekFrameMarginUs;
+    
+    params.timeoutMs = decoderTimeoutMs;
     params.startOffset = videoStartUs;
-    params.endOffset = std::numeric_limits<long>::infinity();
+    params.seekAccuracy = 10;
+    params.headerOnly = false;
 
-    params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
     if (all_streams == true){
-        MediaFormat audioFormat((long) -2);
-        audioFormat.type = TYPE_AUDIO;
-        audioFormat.format.audio.format = defaultAudioSampleFormat;
-        params.formats.insert(audioFormat);
-
-        MediaFormat videoFormat(0, (long) -2);
-        videoFormat.type = TYPE_VIDEO;
-        videoFormat.format.video.format = defaultVideoPixelFormat;
-        videoFormat.format.video.width = 0;
-        videoFormat.format.video.height = 0;
-        videoFormat.format.video.minDimension = 0;
-        videoFormat.format.video.maxDimension = 0;
-        params.formats.insert(videoFormat);
-
-        // there is no clear way on how to use other formats- todo later
-        MediaFormat subtitleFormat(char('0'), long(-2));
-        subtitleFormat.type = TYPE_SUBTITLE;
-        params.formats.insert(subtitleFormat);
-
-        MediaFormat ccFormat(double(0), long(-2));
-        ccFormat.type = TYPE_CC;
-        params.formats.insert(ccFormat);
-
+        MediaFormat format;
+        format.stream = -2;
+        format.type = TYPE_AUDIO;
+        params.formats.insert(format);
+
+        format.type = TYPE_VIDEO;
+        format.stream = -2;
+        format.format.video.width = 0;
+        format.format.video.height = 0;
+        format.format.video.cropImage = 0;
+        params.formats.insert(format);
+
+        format.type = TYPE_SUBTITLE;
+        format.stream = -2;
+        params.formats.insert(format);
+
+        format.type = TYPE_CC;
+        format.stream = -2;
+        params.formats.insert(format);
     } else{
+        // parse stream type
+        MediaType stream_type = parse_type_to_mt(stream);
+        
         // TODO: reset params.formats
         std::set<MediaFormat> formats;
         params.formats = formats;
-        MediaType stream_type = parse_type_to_mt(stream);
-        // now here is a mindfuck 
-        // - there is no way to construct mediaformat by type so we actually
-        // need an endless if/then
-        switch(stream_type) {
-            case TYPE_VIDEO:
-            {
-                MediaFormat videoFormat(0, (long) stream_id);
-                videoFormat.type = TYPE_VIDEO;
-                videoFormat.format.video.format = defaultVideoPixelFormat;
-                params.formats.insert(videoFormat);
-                break;
-            }
-            case TYPE_AUDIO:
-            {        
-                MediaFormat audioFormat((long) stream_id);
-                audioFormat.type = TYPE_AUDIO;
-                audioFormat.format.audio.format = defaultAudioSampleFormat;
-                params.formats.insert(audioFormat);
-                break;
-            }
-            // case TYPE_CC:
-            //     MediaFormat subtitleFormat(char('0'), long(stream_id));
-            //     subtitleFormat.type = TYPE_SUBTITLE;
-            //     params.formats.insert(subtitleFormat);
-            //     break;
-            default:
-            {
-                MediaFormat videoFormat(0, (long) -1);
-                videoFormat.type = TYPE_VIDEO;
-                videoFormat.format.video.format = defaultVideoPixelFormat;
-                params.formats.insert(videoFormat);
-                break;
-            }
+        // Define new format
+        MediaFormat format;
+        format.type = stream_type;
+        format.stream = stream_id;
+        if (stream_type == TYPE_VIDEO){
+            format.format.video.width = 0;
+            format.format.video.height = 0;
+            format.format.video.cropImage = 0;
         }
-
+        params.formats.insert(format);
     }
 
-
-
-        // there is no clear way on how to use other formats- todo later
-        // MediaFormat subtitleFormat("0", (long) -2);
-        // subtitleFormat.type = TYPE_SUBTITLE;
-        // MediaFormat ccFormat((double) 0, (long) -2);
-        // ccFormat.type = TYPE_CC;
-
-}
-
-    // else use the stream using the correct parsing technique
-
-// } // _get decoder params
+} // _get decoder params
 
 
 Video::Video(
@@ -207,6 +166,7 @@ Video::Video(
     std::string stream, 
     bool isReadFile) {
 
+
     //parse stream information
     current_stream = _parseStream(stream);
     // note that in the initial version we want to get all streams
@@ -333,7 +293,6 @@ int64_t Video::Next(std::string stream=""){
 
     DecoderOutputMessage out;
     int64_t res = decoder.decode(&out, decoderTimeoutMs);
-
     if (res == 0){
         return 0;
     }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 30e4822fd7e..5ba2c2e8f04 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -48,8 +48,6 @@ struct Video : torch::CustomClassHolder {
 
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
-    // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
-        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:

From 36cc8f133011f48877c6abf50eff0b90622be808 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 06:24:24 -0500
Subject: [PATCH 020/128] next now works on a video

---
 torchvision/csrc/cpu/video/Video.cpp | 76 +++++++++++++++++++++++-----
 torchvision/csrc/cpu/video/Video.h   | 10 +++-
 2 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index e95c1b376de..8c7975d09a2 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -38,6 +38,40 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 const size_t timeBaseJitterUs = 100;
 
 
+// returns number of written bytes
+template <typename T>
+size_t fillTensorList(DecoderOutputMessage& msgs,
+                      torch::Tensor& frame,
+                      torch::Tensor& framePts) {
+    // if (!msg) {
+    //     return 0;
+    // }
+    // set up PTS data
+    const auto& msg = msgs;
+
+    float* framePtsData = framePts.data_ptr<float>();
+    
+    float pts_s = float(float(msg.header.pts) * 1e-6);
+    framePtsData[0] =  pts_s;
+    
+    T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
+
+    
+    if (frameData) {
+        auto sizeInBytes = msg.payload->length();
+        memcpy(frameData, msg.payload->data(), sizeInBytes);
+    }
+  return sizeof(T);
+}
+
+size_t fillVideoTensor(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& videoFrame,
+    torch::Tensor& videoFramePts) {
+  return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
+}
+
+
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
@@ -196,6 +230,7 @@ Video::Video(
     std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
     std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
     std::vector<double> audioTB, videoTB, ccTB, subsTB;
+    
 
     // calback and metadata defined in struct
     succeeded = decoder.init(params, std::move(callback), &metadata);
@@ -211,21 +246,18 @@ Video::Video(
 
 
             if (header.format.type == TYPE_VIDEO) {
+                videoMetadata = header;
                 videoFPS.push_back(fps);
                 videoDuration.push_back(duration);
-                videoTB.push_back(timeBase);
             } else if (header.format.type == TYPE_AUDIO) {
                 audioFPS.push_back(fps);
                 audioDuration.push_back(duration);
-                audioTB.push_back(timeBase);
             } else if (header.format.type == TYPE_CC){
                 ccFPS.push_back(fps);
                 ccDuration.push_back(duration);
-                ccTB.push_back(timeBase);
             } else if (header.format.type == TYPE_SUBTITLE){
                 subsFPS.push_back(fps);
                 subsDuration.push_back(duration);
-                subsTB.push_back(timeBase);
             };
         }
 
@@ -233,7 +265,6 @@ Video::Video(
 
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
-    streamTimeBase.insert({{"video", videoTB}, {"audio", audioTB}});
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -281,24 +312,41 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
     
     bool succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded){
+        // initialize the class variables and retrurn
+        video_any_frame = any_frame;
+        seekTS = ts; 
         return 0;
     }
-
     return 1;
-
 }
 
+torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
-int64_t Video::Next(std::string stream=""){
+    size_t expectedWrittenBytes = 0;
+    torch::Tensor videoFramePts = torch::zeros({1}, torch::kFloat);
+
+    const auto& format = videoMetadata.format.format.video;
+    int outHeight = format.height;
+    int outWidth = format.width;
+    int numChannels = 3;
+    
+    torch::Tensor videoFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+    expectedWrittenBytes = outHeight * outWidth * numChannels;
+    std::cout << expectedWrittenBytes;
 
     DecoderOutputMessage out;
-    int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    if (res == 0){
-        return 0;
+    // if not in seek mode or only looking at the keyframes, 
+    // return the immediate next frame 
+    if ((seekTS == -1) || (video_any_frame == false)) {
+        int64_t res = decoder.decode(&out, decoderTimeoutMs);
+        auto numberWrittenBytes = fillVideoTensor(out, videoFrame, videoFramePts);
+        out.payload.reset();
     }
-    out.payload.reset();
-    
-    return 1;
+
+    torch::List<torch::Tensor> result;
+    result.push_back(videoFrame);
+    result.push_back(videoFramePts);
+    return result;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 5ba2c2e8f04..0056b5b81a1 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -26,18 +26,23 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    bool any_frame=false; // add this to input parameters
+    bool video_any_frame=false; // add this to input parameters
     bool succeeded=false; // this is decoder init stuff
+    // this acts as a flag - if it's not set, next function simply
+    // retruns the next frame. If it's set, we look at the global seek
+    // time in comination with any_frame settings
+    double seekTS=-1; 
     std::tuple<std::string, int64_t> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
+    DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         int64_t Seek(double ts, std::string stream, bool any_frame);
-        int64_t Next(std::string stream); //torch::List<torch::Tensor>
+        torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
         void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
@@ -49,6 +54,7 @@ struct Video : torch::CustomClassHolder {
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
         
+        
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)

From 780bef942e45699287f291af8a78645695f12238 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 13:02:58 -0500
Subject: [PATCH 021/128] make size of the output tensor format dependent

---
 torchvision/csrc/cpu/video/Video.cpp | 41 ++++++++++++++++------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 8c7975d09a2..10a7ae609b5 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -209,7 +209,7 @@ Video::Video(
         0,      // video start
         0,  //headerOnly
         get<0>(current_stream), // stream
-        long(-1),     // stream_id parsed from info above
+        long(-1),     // stream_id parsed from info above change to -2
         true    // read all streams
     );
 
@@ -322,30 +322,37 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
 
 torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
-    size_t expectedWrittenBytes = 0;
-    torch::Tensor videoFramePts = torch::zeros({1}, torch::kFloat);
-
-    const auto& format = videoMetadata.format.format.video;
-    int outHeight = format.height;
-    int outWidth = format.width;
-    int numChannels = 3;
-    
-    torch::Tensor videoFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-    expectedWrittenBytes = outHeight * outWidth * numChannels;
-    std::cout << expectedWrittenBytes;
 
+    // first decode the frame
     DecoderOutputMessage out;
+    int64_t res = decoder.decode(&out, decoderTimeoutMs);
+    auto header = out.header;
+    const auto& format = header.format;
+
+    // then initialize the output variables based on type
+    size_t expectedWrittenBytes = 0;
+    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+
+    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
+    if (format.type == TYPE_VIDEO) {
+        int outHeight = format.format.video.height;
+        int outWidth = format.format.video.width;
+        int numChannels = 3;
+        outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+        expectedWrittenBytes = outHeight * outWidth * numChannels;
+        std::cout << expectedWrittenBytes;
+    }
+    
     // if not in seek mode or only looking at the keyframes, 
     // return the immediate next frame 
-    if ((seekTS == -1) || (video_any_frame == false)) {
-        int64_t res = decoder.decode(&out, decoderTimeoutMs);
-        auto numberWrittenBytes = fillVideoTensor(out, videoFrame, videoFramePts);
+    if ((seekTS == -1) || (video_any_frame == false)) {            
+        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
         out.payload.reset();
     }
 
     torch::List<torch::Tensor> result;
-    result.push_back(videoFrame);
-    result.push_back(videoFramePts);
+    result.push_back(outFrame);
+    result.push_back(framePTS);
     return result;
 }
 

From 4716512c167926a144d4a852dbee4a0612c7fd0b Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 13:52:57 -0500
Subject: [PATCH 022/128] Make next work on audio stream only as well

---
 torchvision/csrc/cpu/video/Video.cpp | 46 ++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 10a7ae609b5..d22b4e2e1df 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -71,6 +71,13 @@ size_t fillVideoTensor(
   return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
 }
 
+size_t fillAudioTensor(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& audioFrame,
+    torch::Tensor& audioFramePts) {
+  return fillTensorList<float>(msgs, audioFrame, audioFramePts);
+}
+
 
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
@@ -203,8 +210,7 @@ Video::Video(
 
     //parse stream information
     current_stream = _parseStream(stream);
-    // note that in the initial version we want to get all streams
-
+    // note that in the initial call we want to get all streams
     Video::_getDecoderParams(
         0,      // video start
         0,  //headerOnly
@@ -221,11 +227,6 @@ Video::Video(
     logType = "file";
     logMessage = videoPath;
     
-
-    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
-          << "] has started \n";
-    
-
     
     std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
     std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
@@ -244,7 +245,6 @@ Video::Video(
             cout << "Decoding stream of" << header.format.type;
             cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
 
-
             if (header.format.type == TYPE_VIDEO) {
                 videoMetadata = header;
                 videoFPS.push_back(fps);
@@ -262,9 +262,20 @@ Video::Video(
         }
 
     }
-
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+
+
+    // set current stream again
+    Video::_getDecoderParams(
+        0,      // video start
+        0,  //headerOnly
+        get<0>(current_stream), // stream
+        long(-1),     // stream_id parsed from info above change to -2
+        false    // read all streams
+    );
+    // calback and metadata defined in Video.h
+    succeeded = decoder.init(params, std::move(callback), &metadata);
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -341,12 +352,27 @@ torch::List<torch::Tensor> Video::Next(std::string stream=""){
         outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
         expectedWrittenBytes = outHeight * outWidth * numChannels;
         std::cout << expectedWrittenBytes;
+    } else if (format.type == TYPE_AUDIO) {
+        int outAudioChannels = format.format.audio.channels;
+        int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
+        int frameSizeTotal = out.payload->length();
+        
+        CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+        int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+        outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+        expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
     
     // if not in seek mode or only looking at the keyframes, 
     // return the immediate next frame 
     if ((seekTS == -1) || (video_any_frame == false)) {            
-        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+        if (format.type == TYPE_VIDEO) {
+            auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+        } else {
+            auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+        }
         out.payload.reset();
     }
 

From 2a0c73f4e320e5ef5ff61b091e65923e5d3f8362 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 2 Sep 2020 05:56:09 -0500
Subject: [PATCH 023/128] refactoring the _setCurrentStream param

---
 torchvision/csrc/cpu/video/Video.cpp | 67 +++++++++++++++-------------
 torchvision/csrc/cpu/video/Video.h   |  3 +-
 2 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d22b4e2e1df..3b23d7d744b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -145,7 +145,7 @@ std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
 
 
 void Video::_getDecoderParams(
-        int64_t videoStartUs,
+        int64_t videoStartS,
         int64_t getPtsOnly,
         std::string stream,
         long stream_id=-1,
@@ -153,6 +153,8 @@ void Video::_getDecoderParams(
         double seekFrameMarginUs=10){
 
     
+    int64_t videoStartUs = int64_t(videoStartS * 1e6);
+
     params.timeoutMs = decoderTimeoutMs;
     params.startOffset = videoStartUs;
     params.seekAccuracy = 10;
@@ -214,7 +216,7 @@ Video::Video(
     Video::_getDecoderParams(
         0,      // video start
         0,  //headerOnly
-        get<0>(current_stream), // stream
+        get<0>(current_stream), // stream info - remove that
         long(-1),     // stream_id parsed from info above change to -2
         true    // read all streams
     );
@@ -241,10 +243,6 @@ Video::Video(
             double timeBase = double(header.num) / double(header.den);
             double duration = double(header.duration) * 1e-6; // * timeBase;
 
-
-            cout << "Decoding stream of" << header.format.type;
-            cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
-
             if (header.format.type == TYPE_VIDEO) {
                 videoMetadata = header;
                 videoFPS.push_back(fps);
@@ -266,17 +264,40 @@ Video::Video(
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
 
 
-    // set current stream again
-    Video::_getDecoderParams(
-        0,      // video start
+    // // set current stream again
+    // Video::_getDecoderParams(
+    //     0,      // video start
+    //     0,  //headerOnly
+    //     get<0>(current_stream), // stream
+    //     long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
+    //     false    // read all streams
+    // );
+
+    // succeeded = decoder.init(params, std::move(callback), &metadata);
+    succeeded = Video::_setCurrentStream(stream);
+    std::cout << "\nDecoder inited with: " << succeeded;
+} //video
+
+// why is this not woriking? 
+bool Video::_setCurrentStream(std::string stream){  
+    current_stream = _parseStream(stream);
+    double ts = 0;
+    if (seekTS > 0) {
+        ts = seekTS;
+    }
+
+    _getDecoderParams(
+        ts,  // video start
         0,  //headerOnly
         get<0>(current_stream), // stream
-        long(-1),     // stream_id parsed from info above change to -2
+        long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
         false    // read all streams
     );
+
     // calback and metadata defined in Video.h
-    succeeded = decoder.init(params, std::move(callback), &metadata);
-} //video
+    return(decoder.init(params, std::move(callback), &metadata));
+
+}
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
     return current_stream;
@@ -305,27 +326,13 @@ std::vector<double> Video::getDuration(std::string stream) const{
 }
 
 int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    // check if the stream exists
 
-    // convert time to microseconds and cast to unsigned long int
-    int64_t ts_out = int64_t(ts * 1e6);
+    // initialize the class variables and retrurn
+    video_any_frame = any_frame;
+    seekTS = ts; 
 
-    Video::_getDecoderParams(
-        ts_out,
-        0, // we're in full get frame mode
-        get<0>(stream_tpl),
-        get<1>(stream_tpl),
-        false);
-    
-    bool succeeded = decoder.init(params, std::move(callback), &metadata);
+    succeeded = Video::_setCurrentStream(stream);
     if (succeeded){
-        // initialize the class variables and retrurn
-        video_any_frame = any_frame;
-        seekTS = ts; 
         return 0;
     }
     return 1;
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 0056b5b81a1..5a0b706d1aa 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -45,7 +45,8 @@ struct Video : torch::CustomClassHolder {
         torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        bool _setCurrentStream(std::string stream="video");
         std::map<std::string, std::vector<double>> streamTimeBase;
 
         SyncDecoder decoder;

From aa775c2588eb9fff24fc4507648a2e18f68d1206 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 3 Sep 2020 05:36:46 -0500
Subject: [PATCH 024/128] Fixing the last frame return and sensor

---
 torchvision/csrc/cpu/video/Video.cpp | 107 ++++++++++++++++-----------
 video_reader.todo                    |  30 ++++----
 2 files changed, 77 insertions(+), 60 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 3b23d7d744b..3b8fd8c5fc4 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -119,7 +119,7 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
       "Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
+std::tuple<std::string, long> _parseStream(const std::string& streamString){
     TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
     static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
     std::smatch match;
@@ -130,7 +130,7 @@ std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
     
     std::string type_ = "video";
     type_ = parse_type_to_string(match[1].str());
-    int64_t index_ = -1;
+    long index_ = -1;
     if (match[2].matched) {
         try {
         index_ = c10::stoi(match[2].str());
@@ -263,19 +263,12 @@ Video::Video(
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
 
-
-    // // set current stream again
-    // Video::_getDecoderParams(
-    //     0,      // video start
-    //     0,  //headerOnly
-    //     get<0>(current_stream), // stream
-    //     long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
-    //     false    // read all streams
-    // );
-
-    // succeeded = decoder.init(params, std::move(callback), &metadata);
     succeeded = Video::_setCurrentStream(stream);
-    std::cout << "\nDecoder inited with: " << succeeded;
+    LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
+    if (long(get<1>(current_stream)) != -1) {
+        LOG(INFO) << "Stream index set to " << long(get<1>(current_stream) <<
+        ". If you encounter trouble, consider switching it to automatic stream discovery.\n";
+    }
 } //video
 
 // why is this not woriking? 
@@ -340,47 +333,71 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
 
 torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
+    bool switched = false;
+    if ((!stream.empty()) && (_parseStream(stream) != current_stream)){
+        succeeded = Video::_setCurrentStream(stream);
+        if (succeeded){
+            cout << "Switching the stream to new one in next ya'll \n";
+            switched = true;
+        }
+    }
+
+    // if failing to decode simply return 0 (note, maybe 
+    // raise an exeption otherwise)
+    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
     // first decode the frame
     DecoderOutputMessage out;
     int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    auto header = out.header;
-    const auto& format = header.format;
+    if (res == 0){
 
-    // then initialize the output variables based on type
-    size_t expectedWrittenBytes = 0;
-    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+        auto header = out.header;
+        const auto& format = header.format;
 
-    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-    if (format.type == TYPE_VIDEO) {
-        int outHeight = format.format.video.height;
-        int outWidth = format.format.video.width;
-        int numChannels = 3;
-        outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-        expectedWrittenBytes = outHeight * outWidth * numChannels;
-        std::cout << expectedWrittenBytes;
-    } else if (format.type == TYPE_AUDIO) {
-        int outAudioChannels = format.format.audio.channels;
-        int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
-        int frameSizeTotal = out.payload->length();
-        
-        CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-        int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+        if (switched == true) {
+            cout << "now looking at " << format.type <<" \n";
+        }
 
-        outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+        // then initialize the output variables based on type
+        size_t expectedWrittenBytes = 0;
 
-        expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
-    }
-    
-    // if not in seek mode or only looking at the keyframes, 
-    // return the immediate next frame 
-    if ((seekTS == -1) || (video_any_frame == false)) {            
         if (format.type == TYPE_VIDEO) {
-            auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-        } else {
-            auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+            int outHeight = format.format.video.height;
+            int outWidth = format.format.video.width;
+            int numChannels = 3;
+            outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+            expectedWrittenBytes = outHeight * outWidth * numChannels;
+            std::cout << expectedWrittenBytes;
+        } else if (format.type == TYPE_AUDIO) {
+            int outAudioChannels = format.format.audio.channels;
+            int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
+            int frameSizeTotal = out.payload->length();
+            
+            CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+            int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+            outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+            expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
+        }
+        
+        std::cout << "Successfully allocated tensors to the dimension \n" ;
+        // if not in seek mode or only looking at the keyframes, 
+        // return the immediate next frame 
+        if ((seekTS == -1) || (video_any_frame == false)) {   
+
+            std::cout << "In non-seek mode stuff is happening \n";         
+            if (format.type == TYPE_VIDEO) {
+                auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+            } else {
+                auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+            }
+            out.payload.reset();
         }
-        out.payload.reset();
+    }
+    else {
+        LOG(ERROR) << "Decoder run into a last iteration or has failed";
     }
 
     torch::List<torch::Tensor> result;
diff --git a/video_reader.todo b/video_reader.todo
index 2a01bbde4b8..f51dab1c8ef 100644
--- a/video_reader.todo
+++ b/video_reader.todo
@@ -1,15 +1,15 @@
-The new API:
-    ☐ the c++ extension is going to live in torchvision/csrc/cpu/video
-    ☐ modification of the build needs to go to setup.py
-    ☐ torchvision/io/_video something needs to happen somehow
-
-Tests changes:
-    ☐ test/test_io.py
-    ☐ test/test_video_reader.py (change to test video api)
-
-
-
-Implementation:
-    ☐ Datatype for strem
-    ☐ Datatype for container
-    ☐ Do I use tensor as a type in metadata 
\ No newline at end of file
+Documented edgecases that don't work:
+    ☐ seeking with anyframe=True
+    ✔ last frame segfaults - exit cleanly @started(20-09-02 10:44) @done(20-09-02 11:07) @lasted(23m1s)
+    ✔ switching modalities in the subsequent calls to next() @done(20-09-03 05:33)
+    ```
+    video.next("video:0")
+    video.next("video:0")
+    video.next("audio:0")
+    ```
+
+
+Random todo's:
+    ✔ add check for the current stream @done(20-09-02 06:37)
+    ☐ ensure warning if stream is out of bounds
+    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
\ No newline at end of file

From bdb62bf0cb2c634f6a37340f524cb48143c132de Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 3 Sep 2020 06:01:55 -0500
Subject: [PATCH 025/128] todo docs

---
 video_reader.todo | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/video_reader.todo b/video_reader.todo
index f51dab1c8ef..cc9bc343f23 100644
--- a/video_reader.todo
+++ b/video_reader.todo
@@ -12,4 +12,8 @@ Documented edgecases that don't work:
 Random todo's:
     ✔ add check for the current stream @done(20-09-02 06:37)
     ☐ ensure warning if stream is out of bounds
-    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
\ No newline at end of file
+    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
+    ☐ can we make this an iterable?
+    ☐ destructors
+    ☐ adding tests to test.py
+    ☐ thorough checking for memory leaks
\ No newline at end of file

From 632842c0461c81ad510c008132c53c2c2ef0fca0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 8 Sep 2020 06:12:43 -0500
Subject: [PATCH 026/128] Formatting

---
 torchvision/csrc/cpu/video/Video.cpp | 573 +++++++++++++--------------
 torchvision/csrc/cpu/video/Video.h   |  12 +-
 2 files changed, 284 insertions(+), 301 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 3b8fd8c5fc4..d5b5e4f0ed4 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -1,17 +1,14 @@
 
-# include "Video.h"
-#include <torch/script.h>
+#include "Video.h"
 #include <c10/util/Logging.h>
-#include "sync_decoder.h"
-#include "sync_decoder.h"
-#include "memory_buffer.h"
+#include <torch/script.h>
 #include "defs.h"
-
+#include "memory_buffer.h"
+#include "sync_decoder.h"
 
 using namespace std;
 using namespace ffmpeg;
 
-
 // If we are in a Windows environment, we need to define
 // initialization functions for the _custom_ops extension
 // #ifdef _WIN32
@@ -28,8 +25,6 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
-
-
 const size_t decoderTimeoutMs = 600000;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
@@ -37,30 +32,26 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // to compensate rounding error due to the multiple conversions.
 const size_t timeBaseJitterUs = 100;
 
-
 // returns number of written bytes
 template <typename T>
-size_t fillTensorList(DecoderOutputMessage& msgs,
-                      torch::Tensor& frame,
-                      torch::Tensor& framePts) {
-    // if (!msg) {
-    //     return 0;
-    // }
-    // set up PTS data
-    const auto& msg = msgs;
-
-    float* framePtsData = framePts.data_ptr<float>();
-    
-    float pts_s = float(float(msg.header.pts) * 1e-6);
-    framePtsData[0] =  pts_s;
-    
-    T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-
-    
-    if (frameData) {
-        auto sizeInBytes = msg.payload->length();
-        memcpy(frameData, msg.payload->data(), sizeInBytes);
-    }
+size_t fillTensorList(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& frame,
+    torch::Tensor& framePts) {
+  // set up PTS data
+  const auto& msg = msgs;
+
+  float* framePtsData = framePts.data_ptr<float>();
+
+  float pts_s = float(float(msg.header.pts) * 1e-6);
+  framePtsData[0] = pts_s;
+
+  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
+
+  if (frameData) {
+    auto sizeInBytes = msg.payload->length();
+    memcpy(frameData, msg.payload->data(), sizeInBytes);
+  }
   return sizeof(T);
 }
 
@@ -78,7 +69,6 @@ size_t fillAudioTensor(
   return fillTensorList<float>(msgs, audioFrame, audioFramePts);
 }
 
-
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
@@ -95,8 +85,7 @@ std::string parse_type_to_string(const std::string& stream_string) {
   if (device != types.end()) {
     return device->first;
   }
-  AT_ERROR(
-      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
 MediaType parse_type_to_mt(const std::string& stream_string) {
@@ -115,297 +104,291 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
   if (device != types.end()) {
     return device->second;
   }
-  AT_ERROR(
-      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, long> _parseStream(const std::string& streamString){
-    TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
-    static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
-    std::smatch match;
-
-    TORCH_CHECK(
-        std::regex_match(streamString, match, regex),
-        "Invalid stream string: '", streamString, "'");
-    
-    std::string type_ = "video";
-    type_ = parse_type_to_string(match[1].str());
-    long index_ = -1;
-    if (match[2].matched) {
-        try {
-        index_ = c10::stoi(match[2].str());
-        } catch (const std::exception &) {
-        AT_ERROR(
-            "Could not parse device index '", match[2].str(),
-            "' in device string '", streamString, "'");
-        }
+std::tuple<std::string, long> _parseStream(const std::string& streamString) {
+  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
+  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
+  std::smatch match;
+
+  TORCH_CHECK(
+      std::regex_match(streamString, match, regex),
+      "Invalid stream string: '",
+      streamString,
+      "'");
+
+  std::string type_ = "video";
+  type_ = parse_type_to_string(match[1].str());
+  long index_ = -1;
+  if (match[2].matched) {
+    try {
+      index_ = c10::stoi(match[2].str());
+    } catch (const std::exception&) {
+      AT_ERROR(
+          "Could not parse device index '",
+          match[2].str(),
+          "' in device string '",
+          streamString,
+          "'");
     }
-    return std::make_tuple(type_, index_);
+  }
+  return std::make_tuple(type_, index_);
 }
 
-
 void Video::_getDecoderParams(
-        int64_t videoStartS,
-        int64_t getPtsOnly,
-        std::string stream,
-        long stream_id=-1,
-        bool all_streams=false,
-        double seekFrameMarginUs=10){
-
-    
-    int64_t videoStartUs = int64_t(videoStartS * 1e6);
-
-    params.timeoutMs = decoderTimeoutMs;
-    params.startOffset = videoStartUs;
-    params.seekAccuracy = 10;
-    params.headerOnly = false;
-
-    params.preventStaleness = false;  // not sure what this is about
-
-    if (all_streams == true){
-        MediaFormat format;
-        format.stream = -2;
-        format.type = TYPE_AUDIO;
-        params.formats.insert(format);
-
-        format.type = TYPE_VIDEO;
-        format.stream = -2;
-        format.format.video.width = 0;
-        format.format.video.height = 0;
-        format.format.video.cropImage = 0;
-        params.formats.insert(format);
-
-        format.type = TYPE_SUBTITLE;
-        format.stream = -2;
-        params.formats.insert(format);
-
-        format.type = TYPE_CC;
-        format.stream = -2;
-        params.formats.insert(format);
-    } else{
-        // parse stream type
-        MediaType stream_type = parse_type_to_mt(stream);
-        
-        // TODO: reset params.formats
-        std::set<MediaFormat> formats;
-        params.formats = formats;
-        // Define new format
-        MediaFormat format;
-        format.type = stream_type;
-        format.stream = stream_id;
-        if (stream_type == TYPE_VIDEO){
-            format.format.video.width = 0;
-            format.format.video.height = 0;
-            format.format.video.cropImage = 0;
-        }
-        params.formats.insert(format);
+    int64_t videoStartS,
+    int64_t getPtsOnly,
+    std::string stream,
+    long stream_id = -1,
+    bool all_streams = false,
+    double seekFrameMarginUs = 10) {
+  int64_t videoStartUs = int64_t(videoStartS * 1e6);
+
+  params.timeoutMs = decoderTimeoutMs;
+  params.startOffset = videoStartUs;
+  params.seekAccuracy = 10;
+  params.headerOnly = false;
+
+  params.preventStaleness = false; // not sure what this is about
+
+  if (all_streams == true) {
+    MediaFormat format;
+    format.stream = -2;
+    format.type = TYPE_AUDIO;
+    params.formats.insert(format);
+
+    format.type = TYPE_VIDEO;
+    format.stream = -2;
+    format.format.video.width = 0;
+    format.format.video.height = 0;
+    format.format.video.cropImage = 0;
+    params.formats.insert(format);
+
+    format.type = TYPE_SUBTITLE;
+    format.stream = -2;
+    params.formats.insert(format);
+
+    format.type = TYPE_CC;
+    format.stream = -2;
+    params.formats.insert(format);
+  } else {
+    // parse stream type
+    MediaType stream_type = parse_type_to_mt(stream);
+
+    // TODO: reset params.formats
+    std::set<MediaFormat> formats;
+    params.formats = formats;
+    // Define new format
+    MediaFormat format;
+    format.type = stream_type;
+    format.stream = stream_id;
+    if (stream_type == TYPE_VIDEO) {
+      format.format.video.width = 0;
+      format.format.video.height = 0;
+      format.format.video.cropImage = 0;
     }
+    params.formats.insert(format);
+  }
 
 } // _get decoder params
 
-
-Video::Video(
-    std::string videoPath, 
-    std::string stream, 
-    bool isReadFile) {
-
-
-    //parse stream information
-    current_stream = _parseStream(stream);
-    // note that in the initial call we want to get all streams
-    Video::_getDecoderParams(
-        0,      // video start
-        0,  //headerOnly
-        get<0>(current_stream), // stream info - remove that
-        long(-1),     // stream_id parsed from info above change to -2
-        true    // read all streams
-    );
-
-    std::string logMessage, logType;
-    
-
-    // TODO: add read from memory option
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-    
-    
-    std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
-    std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
-    std::vector<double> audioTB, videoTB, ccTB, subsTB;
-    
-
-    // calback and metadata defined in struct
-    succeeded = decoder.init(params, std::move(callback), &metadata);
-    if (succeeded) {
-        for (const auto& header : metadata) {
-            double fps = double(header.fps);
-            double timeBase = double(header.num) / double(header.den);
-            double duration = double(header.duration) * 1e-6; // * timeBase;
-
-            if (header.format.type == TYPE_VIDEO) {
-                videoMetadata = header;
-                videoFPS.push_back(fps);
-                videoDuration.push_back(duration);
-            } else if (header.format.type == TYPE_AUDIO) {
-                audioFPS.push_back(fps);
-                audioDuration.push_back(duration);
-            } else if (header.format.type == TYPE_CC){
-                ccFPS.push_back(fps);
-                ccDuration.push_back(duration);
-            } else if (header.format.type == TYPE_SUBTITLE){
-                subsFPS.push_back(fps);
-                subsDuration.push_back(duration);
-            };
-        }
-
-    }
-    streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
-    streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
-
-    succeeded = Video::_setCurrentStream(stream);
-    LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
-    if (long(get<1>(current_stream)) != -1) {
-        LOG(INFO) << "Stream index set to " << long(get<1>(current_stream) <<
-        ". If you encounter trouble, consider switching it to automatic stream discovery.\n";
-    }
-} //video
-
-// why is this not woriking? 
-bool Video::_setCurrentStream(std::string stream){  
-    current_stream = _parseStream(stream);
-    double ts = 0;
-    if (seekTS > 0) {
-        ts = seekTS;
+Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
+  // parse stream information
+  current_stream = _parseStream(stream);
+  // note that in the initial call we want to get all streams
+  Video::_getDecoderParams(
+      0, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream info - remove that
+      long(-1), // stream_id parsed from info above change to -2
+      true // read all streams
+  );
+
+  std::string logMessage, logType;
+
+  // TODO: add read from memory option
+  params.uri = videoPath;
+  logType = "file";
+  logMessage = videoPath;
+
+  std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
+  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
+  std::vector<double> audioTB, videoTB, ccTB, subsTB;
+
+  // calback and metadata defined in struct
+  succeeded = decoder.init(params, std::move(callback), &metadata);
+  if (succeeded) {
+    for (const auto& header : metadata) {
+      double fps = double(header.fps);
+      double timeBase = double(header.num) / double(header.den);
+      double duration = double(header.duration) * 1e-6; // * timeBase;
+
+      if (header.format.type == TYPE_VIDEO) {
+        videoMetadata = header;
+        videoFPS.push_back(fps);
+        videoDuration.push_back(duration);
+      } else if (header.format.type == TYPE_AUDIO) {
+        audioFPS.push_back(fps);
+        audioDuration.push_back(duration);
+      } else if (header.format.type == TYPE_CC) {
+        ccFPS.push_back(fps);
+        ccDuration.push_back(duration);
+      } else if (header.format.type == TYPE_SUBTITLE) {
+        subsFPS.push_back(fps);
+        subsDuration.push_back(duration);
+      };
     }
+  }
+  streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
+  streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+
+  succeeded = Video::_setCurrentStream();
+  LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
+  if (get<1>(current_stream) != -1) {
+    LOG(INFO)
+        << "Stream index set to " << get<1>(current_stream)
+        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
+  }
+} // video
 
-    _getDecoderParams(
-        ts,  // video start
-        0,  //headerOnly
-        get<0>(current_stream), // stream
-        long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
-        false    // read all streams
-    );
-
-    // calback and metadata defined in Video.h
-    return(decoder.init(params, std::move(callback), &metadata));
+bool Video::_setCurrentStream() {
+  double ts = 0;
+  if (seekTS > 0) {
+    ts = seekTS;
+  }
 
+  _getDecoderParams(
+      ts, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream
+      long(get<1>(
+          current_stream)), // stream_id parsed from info above change to -2
+      false // read all streams
+  );
+
+  // calback and metadata defined in Video.h
+  return (decoder.init(params, std::move(callback), &metadata));
 }
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
-    return current_stream;
+  return current_stream;
 }
 
-std::vector<double> Video::getFPS(std::string stream) const{
-    // add safety check
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    std::string stream_str = get<0>(stream_tpl);
-    // check if the stream exists
-    return streamFPS.at(stream_str);
+std::vector<double> Video::getFPS(std::string stream) const {
+  // add safety check
+  if (stream.empty()) {
+    stream = get<0>(current_stream);
+  }
+  auto stream_tpl = _parseStream(stream);
+  std::string stream_str = get<0>(stream_tpl);
+  // check if the stream exists
+  return streamFPS.at(stream_str);
 }
 
-std::vector<double> Video::getDuration(std::string stream) const{
-    // add safety check
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    std::string stream_str = get<0>(stream_tpl);
-    // check if the stream exists
-    return streamDuration.at(stream_str);
+std::vector<double> Video::getDuration(std::string stream) const {
+  // add safety check
+  if (stream.empty()) {
+    stream = get<0>(current_stream);
+  }
+  auto stream_tpl = _parseStream(stream);
+  std::string stream_str = get<0>(stream_tpl);
+  // check if the stream exists
+  return streamDuration.at(stream_str);
 }
 
-int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
+void Video::Seek(double ts, bool any_frame = false) {
+  // initialize the class variables used for seeking and retrurn
+  video_any_frame = any_frame;
+  seekTS = ts;
+  doSeek = true;
+}
 
-    // initialize the class variables and retrurn
-    video_any_frame = any_frame;
-    seekTS = ts; 
+torch::List<torch::Tensor> Video::Next(std::string stream) {
 
-    succeeded = Video::_setCurrentStream(stream);
-    if (succeeded){
-        return 0;
-    }
-    return 1;
-}
+  bool newInit = false;
+  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
+      current_stream = _parseStream(stream);
+      newInit = true;
+  }
 
-torch::List<torch::Tensor> Video::Next(std::string stream=""){
+  if ((seekTS != -1) && (doSeek == true)) {
+      newInit = true;
+      doSeek = false;
+  }
 
-    bool switched = false;
-    if ((!stream.empty()) && (_parseStream(stream) != current_stream)){
-        succeeded = Video::_setCurrentStream(stream);
-        if (succeeded){
-            cout << "Switching the stream to new one in next ya'll \n";
-            switched = true;
-        }
+  if (newInit){
+    succeeded = Video::_setCurrentStream();
+    if (succeeded) {
+      newInit = false;
+      // cout << "Reinitializing the decoder again \n";
     }
+  }
 
-    // if failing to decode simply return 0 (note, maybe 
-    // raise an exeption otherwise)
-    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
-    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-
-    // first decode the frame
-    DecoderOutputMessage out;
-    int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    if (res == 0){
-
-        auto header = out.header;
-        const auto& format = header.format;
-
-        if (switched == true) {
-            cout << "now looking at " << format.type <<" \n";
-        }
-
-        // then initialize the output variables based on type
-        size_t expectedWrittenBytes = 0;
-
-        if (format.type == TYPE_VIDEO) {
-            int outHeight = format.format.video.height;
-            int outWidth = format.format.video.width;
-            int numChannels = 3;
-            outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-            expectedWrittenBytes = outHeight * outWidth * numChannels;
-            std::cout << expectedWrittenBytes;
-        } else if (format.type == TYPE_AUDIO) {
-            int outAudioChannels = format.format.audio.channels;
-            int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
-            int frameSizeTotal = out.payload->length();
-            
-            CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-            int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-            outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
-            expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
-        }
-        
-        std::cout << "Successfully allocated tensors to the dimension \n" ;
-        // if not in seek mode or only looking at the keyframes, 
-        // return the immediate next frame 
-        if ((seekTS == -1) || (video_any_frame == false)) {   
-
-            std::cout << "In non-seek mode stuff is happening \n";         
-            if (format.type == TYPE_VIDEO) {
-                auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-            } else {
-                auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-            }
-            out.payload.reset();
-        }
+  // if failing to decode simply return 0 (note, maybe
+  // raise an exeption otherwise)
+  torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
+
+  // first decode the frame
+  DecoderOutputMessage out;
+  int64_t res = decoder.decode(&out, decoderTimeoutMs);
+  if (res == 0) {
+    auto header = out.header;
+    const auto& format = header.format;
+
+    // then initialize the output variables based on type
+    size_t expectedWrittenBytes = 0;
+
+    if (format.type == TYPE_VIDEO) {
+      int outHeight = format.format.video.height;
+      int outWidth = format.format.video.width;
+      int numChannels = 3;
+      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+      expectedWrittenBytes = outHeight * outWidth * numChannels;
+      // std::cout << expectedWrittenBytes;
+    } else if (format.type == TYPE_AUDIO) {
+      int outAudioChannels = format.format.audio.channels;
+      int bytesPerSample = av_get_bytes_per_sample(
+          static_cast<AVSampleFormat>(format.format.audio.format));
+      int frameSizeTotal = out.payload->length();
+
+      CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+      int numAudioSamples =
+          frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+      outFrame =
+          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+      expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
-    else {
-        LOG(ERROR) << "Decoder run into a last iteration or has failed";
+
+    // std::cout << "Successfully allocated tensors to the dimension \n";
+    // if not in seek mode or only looking at the keyframes,
+    // return the immediate next frame
+    if ((seekTS == -1) || (video_any_frame == false)) {
+      // std::cout << "In non-seek mode stuff is happening \n";
+      if (format.type == TYPE_VIDEO) {
+        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      } else {
+        auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+      }
+      out.payload.reset();
     }
+  } else {
+    LOG(ERROR) << "Decoder run into a last iteration or has failed";
+  }
 
-    torch::List<torch::Tensor> result;
-    result.push_back(outFrame);
-    result.push_back(framePTS);
-    return result;
+  torch::List<torch::Tensor> result;
+  result.push_back(outFrame);
+  result.push_back(framePTS);
+  return result;
 }
 
-
-
-
+Video::~Video() {
+//   delete params; // does not have destructor
+//   delete metadata; // struct does not have destructor
+//   delete decoder; // should be fine
+//   delete streamFPS; // should be fine
+//   delete streamDuration; // should be fine
+}
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 5a0b706d1aa..eb32e83a64b 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -28,25 +28,27 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
     bool video_any_frame=false; // add this to input parameters
     bool succeeded=false; // this is decoder init stuff
-    // this acts as a flag - if it's not set, next function simply
+    // seekTS acts as a flag - if it's not set, next function simply
     // retruns the next frame. If it's set, we look at the global seek
     // time in comination with any_frame settings
     double seekTS=-1; 
-    std::tuple<std::string, int64_t> current_stream;
+    bool doSeek=false;
+    std::tuple<std::string, long> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
     DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
+        ~Video();
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
-        int64_t Seek(double ts, std::string stream, bool any_frame);
+        void Seek(double ts, bool any_frame);
         torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
         void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-        bool _setCurrentStream(std::string stream="video");
+        bool _setCurrentStream();
         std::map<std::string, std::vector<double>> streamTimeBase;
 
         SyncDecoder decoder;
@@ -60,8 +62,6 @@ struct Video : torch::CustomClassHolder {
     protected:
         // AV container type (check in decoder for exact type)
 
-        // int64_t SecToStream(double ts); // TODO: add stream type
-        // double StreamToSec(int64_t pts); // TODO: add stream type
     
 
 }; // struct Video

From 8b7644dac88572caddaf24f9290dcb53840c3f83 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 16 Sep 2020 07:19:01 -0500
Subject: [PATCH 027/128] cleanup and comments

---
 torchvision/csrc/cpu/video/Video.cpp    | 46 ++++++++++++-------------
 torchvision/csrc/cpu/video/Video.h      | 38 ++++++++++----------
 torchvision/csrc/cpu/video/register.cpp |  1 -
 3 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d5b5e4f0ed4..c4b330d1d29 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -225,7 +225,6 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       double duration = double(header.duration) * 1e-6; // * timeBase;
 
       if (header.format.type == TYPE_VIDEO) {
-        videoMetadata = header;
         videoFPS.push_back(fps);
         videoDuration.push_back(duration);
       } else if (header.format.type == TYPE_AUDIO) {
@@ -305,8 +304,8 @@ void Video::Seek(double ts, bool any_frame = false) {
 }
 
 torch::List<torch::Tensor> Video::Next(std::string stream) {
-
-  bool newInit = false;
+  
+  bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
       current_stream = _parseStream(stream);
       newInit = true;
@@ -321,32 +320,34 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     succeeded = Video::_setCurrentStream();
     if (succeeded) {
       newInit = false;
-      // cout << "Reinitializing the decoder again \n";
     }
   }
 
-  // if failing to decode simply return 0 (note, maybe
-  // raise an exeption otherwise)
+  // if failing to decode simply return a null tensor (note, should we
+  // raise an exeption?)
   torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
   torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
-  // first decode the frame
+  // decode single frame
   DecoderOutputMessage out;
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
+  // if successfull
   if (res == 0) {
     auto header = out.header;
     const auto& format = header.format;
 
-    // then initialize the output variables based on type
+    // initialize the output variables based on type
     size_t expectedWrittenBytes = 0;
 
     if (format.type == TYPE_VIDEO) {
+      // note: this can potentially be optimized
+      // by having the global tensor that we fill at decode time
+      // (would avoid allocations)
       int outHeight = format.format.video.height;
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
       expectedWrittenBytes = outHeight * outWidth * numChannels;
-      // std::cout << expectedWrittenBytes;
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -362,21 +363,17 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
       expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
+    // currently not supporting other formats (will do soon)
 
-    // std::cout << "Successfully allocated tensors to the dimension \n";
-    // if not in seek mode or only looking at the keyframes,
-    // return the immediate next frame
-    if ((seekTS == -1) || (video_any_frame == false)) {
-      // std::cout << "In non-seek mode stuff is happening \n";
-      if (format.type == TYPE_VIDEO) {
-        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-      } else {
-        auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-      }
-      out.payload.reset();
+    // note: this will need to be revised to support less-accurate seek. So far keep as is
+    if (format.type == TYPE_VIDEO) {
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+    } else {
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
     }
-  } else {
-    LOG(ERROR) << "Decoder run into a last iteration or has failed";
+    out.payload.reset();
+  } else{
+    LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
   torch::List<torch::Tensor> result;
@@ -385,10 +382,11 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
   return result;
 }
 
-Video::~Video() {
+// Video::~Video() {
+  // destructor to be defined thoroughly later
 //   delete params; // does not have destructor
 //   delete metadata; // struct does not have destructor
 //   delete decoder; // should be fine
 //   delete streamFPS; // should be fine
 //   delete streamDuration; // should be fine
-}
+// }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index eb32e83a64b..1db5ed0ae79 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -26,42 +26,40 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    bool video_any_frame=false; // add this to input parameters
-    bool succeeded=false; // this is decoder init stuff
-    // seekTS acts as a flag - if it's not set, next function simply
-    // retruns the next frame. If it's set, we look at the global seek
-    // time in comination with any_frame settings
-    double seekTS=-1; 
-    bool doSeek=false;
-    std::tuple<std::string, long> current_stream;
+    
+    
+    
+    std::tuple<std::string, long> current_stream; // streaam type, id
+    // global video metadata
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
-    DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        ~Video();
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         void Seek(double ts, bool any_frame);
-        torch::List<torch::Tensor> Next(std::string stream); //
+        torch::List<torch::Tensor> Next(std::string stream);
 
     private:
+        bool video_any_frame=false; // add this to input parameters?
+        bool succeeded=false; // decoder init flag
+        // seekTS and doSeek act as a flag - if it's not set, next function simply
+        // retruns the next frame. If it's set, we look at the global seek
+        // time in comination with any_frame settings
+        double seekTS=-1; 
+        bool doSeek=false;
+
         void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
         bool _setCurrentStream();
-        std::map<std::string, std::vector<double>> streamTimeBase;
-
-        SyncDecoder decoder;
-        DecoderParameters params;
+        std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
-        
-        
-        // torch::List<torch::Tensor> Peak(std::string stream="")
+                
     protected:
-        // AV container type (check in decoder for exact type)
-
+        SyncDecoder decoder;
+        DecoderParameters params;
     
 
 }; // struct Video
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 9a0fafeee6a..091052f4808 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -14,6 +14,5 @@ static auto registerVideo =
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);
 
-
 } //namespace
 #endif

From cddc92eb4cbbf07ce4bd11c53be2b6419a966afa Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 17 Sep 2020 07:06:05 -0500
Subject: [PATCH 028/128] introducing new tests for the API

---
 test/test_video.py                   | 160 +++++++++++++++++++++++++++
 torchvision/csrc/cpu/video/Video.cpp |   1 +
 2 files changed, 161 insertions(+)
 create mode 100644 test/test_video.py

diff --git a/test/test_video.py b/test/test_video.py
new file mode 100644
index 00000000000..76a0dd60c06
--- /dev/null
+++ b/test/test_video.py
@@ -0,0 +1,160 @@
+import os
+import collections
+import contextlib
+import tempfile
+import unittest
+
+
+import numpy as np
+
+import torch
+import torchvision
+from torchvision.io import _HAS_VIDEO_OPT
+
+
+
+VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
+
+CheckerConfig = [
+    "duration",
+    "video_fps",
+    "audio_sample_rate",
+    # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are
+    # slightly different between TorchVision decoder and PyAv decoder. So omit it during check
+    "check_aframes",
+    "check_aframe_pts",
+]
+GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig))
+
+all_check_config = GroundTruth(
+    duration=0,
+    video_fps=0,
+    audio_sample_rate=0,
+    check_aframes=True,
+    check_aframe_pts=True,
+)
+
+test_videos = {
+    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "v_SoccerJuggling_g23_c01.avi": GroundTruth(
+        duration=8.0,
+        video_fps=29.97,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "v_SoccerJuggling_g24_c01.avi": GroundTruth(
+        duration=8.0,
+        video_fps=29.97,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    ### Last three test segfault on video reader (see issues)
+    # "R6llTwEh07w.mp4": GroundTruth(
+    #     duration=10.0,
+    #     video_fps=30.0,
+    #     audio_sample_rate=44100,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+    # "SOX5yA1l24A.mp4": GroundTruth(
+    #     duration=11.0,
+    #     video_fps=29.97,
+    #     audio_sample_rate=48000,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+    # "WUzgd7C1pWA.mp4": GroundTruth(
+    #     duration=11.0,
+    #     video_fps=29.97,
+    #     audio_sample_rate=48000,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+}
+
+
+@unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
+class TestVideo(unittest.TestCase):
+    def test_read_video_tensor(self):
+        """
+        Check if reading the video using the `next` based API yields the
+        same sized and equal tensors as video_reader.
+        """
+        print("test read")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            print(test_video)
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            # pass 1: decode all frames using existing TV decoder
+            tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
+            # pass 2: decode all frames using new api
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            frames = []
+            t, _ = reader.next("")
+            while t.numel() > 0:
+                frames.append(t)
+                t, _ = reader.next("")
+            new_api = torch.stack(frames, 0)
+            self.assertEqual(tv_result.size(), new_api.size())
+            self.assertEqual(torch.equal(tv_result, new_api), True)
+    
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_pts(self):
+        """Check if the frames have the same timestamps
+        """
+        print("test timestamp")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            tv_timestamps, _ =  torchvision.io.read_video_timestamps(full_path, pts_unit='sec')
+            # pass 2: decode all frames using new api
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            pts = []
+            t, p = reader.next("")
+            while t.numel() > 0:
+                pts.append(p)
+                t, p = reader.next("")
+            
+            tv_timestamps = [float(p) for p in tv_timestamps]
+            napi_pts = [float(p.item()) for p in pts]
+            for i in range(len(napi_pts)):
+                self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
+
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_metadata(self):
+        print("test fps")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            self.assertAlmostEqual(config.video_fps, reader.fps("")[0], delta=0.0001)
+            self.assertAlmostEqual(config.duration, reader.duration("")[0], delta=0.5)
+      
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index c4b330d1d29..1c633c8874d 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -187,6 +187,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
+      format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From 6411cf92f125d3a21982d70d4260b97882aab2b7 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 17 Sep 2020 07:06:22 -0500
Subject: [PATCH 029/128] cleanup

---
 video_reader.todo | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 video_reader.todo

diff --git a/video_reader.todo b/video_reader.todo
deleted file mode 100644
index cc9bc343f23..00000000000
--- a/video_reader.todo
+++ /dev/null
@@ -1,19 +0,0 @@
-Documented edgecases that don't work:
-    ☐ seeking with anyframe=True
-    ✔ last frame segfaults - exit cleanly @started(20-09-02 10:44) @done(20-09-02 11:07) @lasted(23m1s)
-    ✔ switching modalities in the subsequent calls to next() @done(20-09-03 05:33)
-    ```
-    video.next("video:0")
-    video.next("video:0")
-    video.next("audio:0")
-    ```
-
-
-Random todo's:
-    ✔ add check for the current stream @done(20-09-02 06:37)
-    ☐ ensure warning if stream is out of bounds
-    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
-    ☐ can we make this an iterable?
-    ☐ destructors
-    ☐ adding tests to test.py
-    ☐ thorough checking for memory leaks
\ No newline at end of file

From 3edd9f4eff110d61b2273c75a82edf16a0456f42 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 04:53:45 -0500
Subject: [PATCH 030/128] Comment out unnecessary format (will add following
 FFMPEG fix)

---
 torchvision/csrc/cpu/video/Video.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 1c633c8874d..4cad0a9ebf4 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -187,7 +187,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
-      format.format.video.format = defaultVideoPixelFormat;
+      // format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From 1b6df6525a804e507a479caa6dee7ffd17028b2b Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:07:45 -0500
Subject: [PATCH 031/128] Reformat parsing function

---
 torchvision/csrc/cpu/video/Video.cpp | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4cad0a9ebf4..2ea480f1b1a 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -25,6 +25,7 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
+
 const size_t decoderTimeoutMs = 600000;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
@@ -69,7 +70,7 @@ size_t fillAudioTensor(
   return fillTensorList<float>(msgs, audioFrame, audioFramePts);
 }
 
-std::string parse_type_to_string(const std::string& stream_string) {
+std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
       {"audio", TYPE_AUDIO},
@@ -83,28 +84,19 @@ std::string parse_type_to_string(const std::string& stream_string) {
         return p.first == stream_string;
       });
   if (device != types.end()) {
-    return device->first;
+    return device;
   }
   AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
+std::string parse_type_to_string(const std::string& stream_string){
+    auto device = _parse_type(stream_string);
+    return device->first;
+}
+
 MediaType parse_type_to_mt(const std::string& stream_string) {
-  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
-      {"video", TYPE_VIDEO},
-      {"audio", TYPE_AUDIO},
-      {"subtitle", TYPE_SUBTITLE},
-      {"cc", TYPE_CC},
-  }};
-  auto device = std::find_if(
-      types.begin(),
-      types.end(),
-      [stream_string](const std::pair<std::string, MediaType>& p) {
-        return p.first == stream_string;
-      });
-  if (device != types.end()) {
+    auto device = _parse_type(stream_string);
     return device->second;
-  }
-  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
 std::tuple<std::string, long> _parseStream(const std::string& streamString) {

From c245c4f7f2793f46996c82797064701a99ab6926 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:11:29 -0500
Subject: [PATCH 032/128] removing the seek bug in `_getDecoderParams`

---
 torchvision/csrc/cpu/video/Video.cpp | 3 ++-
 torchvision/csrc/cpu/video/Video.h   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 2ea480f1b1a..0cc22863cf2 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -129,12 +129,13 @@ std::tuple<std::string, long> _parseStream(const std::string& streamString) {
 }
 
 void Video::_getDecoderParams(
-    int64_t videoStartS,
+    double videoStartS,
     int64_t getPtsOnly,
     std::string stream,
     long stream_id = -1,
     bool all_streams = false,
     double seekFrameMarginUs = 10) {
+  
   int64_t videoStartUs = int64_t(videoStartS * 1e6);
 
   params.timeoutMs = decoderTimeoutMs;
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 1db5ed0ae79..cba97cfade3 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -50,7 +50,7 @@ struct Video : torch::CustomClassHolder {
         double seekTS=-1; 
         bool doSeek=false;
 
-        void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(double videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
         bool _setCurrentStream();
         std::map<std::string, std::vector<double>> streamTimeBase; // not used
 

From e779779dca74e2910923c3e8b43f0efba5deddc8 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:19:30 -0500
Subject: [PATCH 033/128] Removing unnecessary code/variables

---
 torchvision/csrc/cpu/video/Video.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 0cc22863cf2..d3680612689 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -140,7 +140,7 @@ void Video::_getDecoderParams(
 
   params.timeoutMs = decoderTimeoutMs;
   params.startOffset = videoStartUs;
-  params.seekAccuracy = 10;
+  params.seekAccuracy = seekFrameMarginUs;
   params.headerOnly = false;
 
   params.preventStaleness = false; // not sure what this is about
@@ -331,7 +331,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     const auto& format = header.format;
 
     // initialize the output variables based on type
-    size_t expectedWrittenBytes = 0;
 
     if (format.type == TYPE_VIDEO) {
       // note: this can potentially be optimized
@@ -341,7 +340,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      expectedWrittenBytes = outHeight * outWidth * numChannels;
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -355,7 +353,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
 
-      expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
     // currently not supporting other formats (will do soon)
 

From 62b6aa860c9ca36fefd7bbc4ad9d30752accf3eb Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:23:19 -0500
Subject: [PATCH 034/128] enforce RGB24 as a reading format (will crash before
 ffmpeg fix)

---
 torchvision/csrc/cpu/video/Video.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d3680612689..fabcee43e9f 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -27,6 +27,7 @@ using namespace ffmpeg;
 
 
 const size_t decoderTimeoutMs = 600000;
+const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -180,7 +181,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
-      // format.format.video.format = defaultVideoPixelFormat;
+      format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From d4aca2853bf795847ff46370f4b60269cb4bf976 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:40:32 -0500
Subject: [PATCH 035/128] permute the dimensions to return (RGB x H x W)

---
 test/test_video.py                   |  1 +
 torchvision/csrc/cpu/video/Video.cpp | 12 +++++-------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 76a0dd60c06..c7afb6a8aa9 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -112,6 +112,7 @@ def test_read_video_tensor(self):
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using existing TV decoder
             tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
+            tv_result = tv_result.permute(0, 3, 1, 2)
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             frames = []
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index fabcee43e9f..a3e2c31cfb5 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -341,6 +341,9 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      outFrame = outFrame.permute({2, 0, 1});
+
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -353,16 +356,11 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
+      
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS); 
     }
     // currently not supporting other formats (will do soon)
 
-    // note: this will need to be revised to support less-accurate seek. So far keep as is
-    if (format.type == TYPE_VIDEO) {
-      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-    } else {
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-    }
     out.payload.reset();
   } else{
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";

From b9c8d5c106e1ba8dbdc3552e77fcb05002440507 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:54:53 -0500
Subject: [PATCH 036/128] Changing the return type to std::tuple<torch::Tensor,
 double> as opposed to tensor list

---
 torchvision/csrc/cpu/video/Video.cpp | 37 +++++++++++-----------------
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index a3e2c31cfb5..4ff1d9102c0 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -38,18 +38,10 @@ const size_t timeBaseJitterUs = 100;
 template <typename T>
 size_t fillTensorList(
     DecoderOutputMessage& msgs,
-    torch::Tensor& frame,
-    torch::Tensor& framePts) {
-  // set up PTS data
+    torch::Tensor& frame) {
+  
   const auto& msg = msgs;
-
-  float* framePtsData = framePts.data_ptr<float>();
-
-  float pts_s = float(float(msg.header.pts) * 1e-6);
-  framePtsData[0] = pts_s;
-
   T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-
   if (frameData) {
     auto sizeInBytes = msg.payload->length();
     memcpy(frameData, msg.payload->data(), sizeInBytes);
@@ -59,16 +51,14 @@ size_t fillTensorList(
 
 size_t fillVideoTensor(
     DecoderOutputMessage& msgs,
-    torch::Tensor& videoFrame,
-    torch::Tensor& videoFramePts) {
-  return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
+    torch::Tensor& videoFrame) {
+  return fillTensorList<uint8_t>(msgs, videoFrame);
 }
 
 size_t fillAudioTensor(
     DecoderOutputMessage& msgs,
-    torch::Tensor& audioFrame,
-    torch::Tensor& audioFramePts) {
-  return fillTensorList<float>(msgs, audioFrame, audioFramePts);
+    torch::Tensor& audioFrame) {
+  return fillTensorList<float>(msgs, audioFrame);
 }
 
 std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
@@ -298,7 +288,7 @@ void Video::Seek(double ts, bool any_frame = false) {
   doSeek = true;
 }
 
-torch::List<torch::Tensor> Video::Next(std::string stream) {
+std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
   
   bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
@@ -320,7 +310,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
-  torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+  double frame_pts_s;
   torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
   // decode single frame
@@ -328,6 +318,9 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
   // if successfull
   if (res == 0) {
+
+    frame_pts_s = double(double(out.header.pts) * 1e-6);
+    
     auto header = out.header;
     const auto& format = header.format;
 
@@ -341,7 +334,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame);
       outFrame = outFrame.permute({2, 0, 1});
 
     } else if (format.type == TYPE_AUDIO) {
@@ -357,7 +350,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
       
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS); 
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame); 
     }
     // currently not supporting other formats (will do soon)
 
@@ -366,9 +359,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
-  torch::List<torch::Tensor> result;
-  result.push_back(outFrame);
-  result.push_back(framePTS);
+  std::tuple<torch::Tensor, double> result = {outFrame, frame_pts_s};
   return result;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index cba97cfade3..0a45aff72c7 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -39,7 +39,7 @@ struct Video : torch::CustomClassHolder {
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         void Seek(double ts, bool any_frame);
-        torch::List<torch::Tensor> Next(std::string stream);
+        std::tuple<torch::Tensor, double> Next(std::string stream);
 
     private:
         bool video_any_frame=false; // add this to input parameters?

From 430ed5caa3386a288c2254e97aa47af2370de0c8 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:55:17 -0500
Subject: [PATCH 037/128] Adjusting tests for the new return type

---
 test/test_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_video.py b/test/test_video.py
index c7afb6a8aa9..32a36b63cb0 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -143,7 +143,7 @@ def test_pts(self):
                 t, p = reader.next("")
             
             tv_timestamps = [float(p) for p in tv_timestamps]
-            napi_pts = [float(p.item()) for p in pts]
+            napi_pts = [float(p) for p in pts]
             for i in range(len(napi_pts)):
                 self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
 

From 80bda0e68f2e026f7b72cf60aae6e6db1227a8c9 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 06:00:01 -0500
Subject: [PATCH 038/128] remove unnecessary jitter

---
 torchvision/csrc/cpu/video/Video.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4ff1d9102c0..c237138f13c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,10 +29,6 @@ using namespace ffmpeg;
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
-// A jitter can be added to the end of the range to avoid conversion/rounding
-// error, small value 100us won't be enough to select the next frame, but enough
-// to compensate rounding error due to the multiple conversions.
-const size_t timeBaseJitterUs = 100;
 
 // returns number of written bytes
 template <typename T>

From 7d1dc1fd36962f0512accb31b48e02e1fc516ff7 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 06:13:37 -0500
Subject: [PATCH 039/128] apply clang-format

---
 torchvision/csrc/cpu/video/Video.cpp    | 53 +++++++--------
 torchvision/csrc/cpu/video/Video.h      | 90 ++++++++++++-------------
 torchvision/csrc/cpu/video/register.cpp |  2 +-
 3 files changed, 68 insertions(+), 77 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index c237138f13c..9f3537b06e5 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -25,17 +25,13 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
-
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 
 // returns number of written bytes
 template <typename T>
-size_t fillTensorList(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& frame) {
-  
+size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
   const auto& msg = msgs;
   T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
   if (frameData) {
@@ -45,19 +41,19 @@ size_t fillTensorList(
   return sizeof(T);
 }
 
-size_t fillVideoTensor(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& videoFrame) {
+size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
   return fillTensorList<uint8_t>(msgs, videoFrame);
 }
 
-size_t fillAudioTensor(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& audioFrame) {
+size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
   return fillTensorList<float>(msgs, audioFrame);
 }
 
-std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
+std::pair<
+    std::__cxx11::
+        basic_string<char, std::char_traits<char>, std::allocator<char>>,
+    ffmpeg::MediaType> const*
+_parse_type(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
       {"audio", TYPE_AUDIO},
@@ -76,14 +72,14 @@ std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocato
   AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::string parse_type_to_string(const std::string& stream_string){
-    auto device = _parse_type(stream_string);
-    return device->first;
+std::string parse_type_to_string(const std::string& stream_string) {
+  auto device = _parse_type(stream_string);
+  return device->first;
 }
 
 MediaType parse_type_to_mt(const std::string& stream_string) {
-    auto device = _parse_type(stream_string);
-    return device->second;
+  auto device = _parse_type(stream_string);
+  return device->second;
 }
 
 std::tuple<std::string, long> _parseStream(const std::string& streamString) {
@@ -122,7 +118,6 @@ void Video::_getDecoderParams(
     long stream_id = -1,
     bool all_streams = false,
     double seekFrameMarginUs = 10) {
-  
   int64_t videoStartUs = int64_t(videoStartS * 1e6);
 
   params.timeoutMs = decoderTimeoutMs;
@@ -285,19 +280,18 @@ void Video::Seek(double ts, bool any_frame = false) {
 }
 
 std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
-  
   bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-      current_stream = _parseStream(stream);
-      newInit = true;
+    current_stream = _parseStream(stream);
+    newInit = true;
   }
 
   if ((seekTS != -1) && (doSeek == true)) {
-      newInit = true;
-      doSeek = false;
+    newInit = true;
+    doSeek = false;
   }
 
-  if (newInit){
+  if (newInit) {
     succeeded = Video::_setCurrentStream();
     if (succeeded) {
       newInit = false;
@@ -314,9 +308,8 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
   // if successfull
   if (res == 0) {
-
     frame_pts_s = double(double(out.header.pts) * 1e-6);
-    
+
     auto header = out.header;
     const auto& format = header.format;
 
@@ -345,13 +338,13 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
 
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-      
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame); 
+
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame);
     }
     // currently not supporting other formats (will do soon)
 
     out.payload.reset();
-  } else{
+  } else {
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
@@ -360,7 +353,7 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
 }
 
 // Video::~Video() {
-  // destructor to be defined thoroughly later
+// destructor to be defined thoroughly later
 //   delete params; // does not have destructor
 //   delete metadata; // struct does not have destructor
 //   delete decoder; // should be fine
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 0a45aff72c7..d9e0a4b7a44 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -3,66 +3,64 @@
 #ifndef VIDEO_H_
 #define VIDEO_H_
 
-
+#include <map>
+#include <regex>
 #include <string>
 #include <vector>
-#include <regex>
-#include <map>
 
 #include <ATen/ATen.h>
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
 
-
 #include <exception>
-#include "sync_decoder.h"
-#include "memory_buffer.h"
 #include "defs.h"
-
+#include "memory_buffer.h"
+#include "sync_decoder.h"
 
 using namespace ffmpeg;
 
-
-
 struct Video : torch::CustomClassHolder {
-    
-    
-    
-    std::tuple<std::string, long> current_stream; // streaam type, id
-    // global video metadata
-    std::map<std::string, std::vector<double>> streamFPS;
-    std::map<std::string, std::vector<double>> streamDuration;
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::tuple<std::string, int64_t> getCurrentStream() const;
-        std::vector<double> getDuration(std::string stream="") const;
-        std::vector<double> getFPS(std::string stream="") const;
-        void Seek(double ts, bool any_frame);
-        std::tuple<torch::Tensor, double> Next(std::string stream);
-
-    private:
-        bool video_any_frame=false; // add this to input parameters?
-        bool succeeded=false; // decoder init flag
-        // seekTS and doSeek act as a flag - if it's not set, next function simply
-        // retruns the next frame. If it's set, we look at the global seek
-        // time in comination with any_frame settings
-        double seekTS=-1; 
-        bool doSeek=false;
-
-        void _getDecoderParams(double videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-        bool _setCurrentStream();
-        std::map<std::string, std::vector<double>> streamTimeBase; // not used
-
-        DecoderInCallback callback = nullptr;;
-        std::vector<DecoderMetadata> metadata;
-                
-    protected:
-        SyncDecoder decoder;
-        DecoderParameters params;
-    
+  std::tuple<std::string, long> current_stream; // streaam type, id
+  // global video metadata
+  std::map<std::string, std::vector<double>> streamFPS;
+  std::map<std::string, std::vector<double>> streamDuration;
+
+ public:
+  Video(std::string videoPath, std::string stream, bool isReadFile);
+  std::tuple<std::string, int64_t> getCurrentStream() const;
+  std::vector<double> getDuration(std::string stream = "") const;
+  std::vector<double> getFPS(std::string stream = "") const;
+  void Seek(double ts, bool any_frame);
+  std::tuple<torch::Tensor, double> Next(std::string stream);
+
+ private:
+  bool video_any_frame = false; // add this to input parameters?
+  bool succeeded = false; // decoder init flag
+  // seekTS and doSeek act as a flag - if it's not set, next function simply
+  // retruns the next frame. If it's set, we look at the global seek
+  // time in comination with any_frame settings
+  double seekTS = -1;
+  bool doSeek = false;
+
+  void _getDecoderParams(
+      double videoStartS,
+      int64_t getPtsOnly,
+      std::string stream,
+      long stream_id,
+      bool all_streams,
+      double seekFrameMarginUs); // this needs to be improved
+  bool _setCurrentStream();
+  std::map<std::string, std::vector<double>> streamTimeBase; // not used
+
+  DecoderInCallback callback = nullptr;
+  ;
+  std::vector<DecoderMetadata> metadata;
+
+ protected:
+  SyncDecoder decoder;
+  DecoderParameters params;
 
 }; // struct Video
 
-
-#endif  // VIDEO_H_
+#endif // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 091052f4808..cfd2d2dd5c1 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -14,5 +14,5 @@ static auto registerVideo =
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);
 
-} //namespace
+} // namespace
 #endif

From c6242666fd52df28d2b651cacf8b6358fd2eddc4 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 24 Sep 2020 10:43:02 +0100
Subject: [PATCH 040/128] Metadata return changes (#1)

---
 test/test_video.py                      |  6 ++---
 torchvision/csrc/cpu/video/Video.cpp    | 31 +++++++------------------
 torchvision/csrc/cpu/video/Video.h      |  6 ++---
 torchvision/csrc/cpu/video/register.cpp |  3 +--
 4 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 32a36b63cb0..05e75e2f598 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -108,7 +108,6 @@ def test_read_video_tensor(self):
         print("test read")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
-            print(test_video)
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using existing TV decoder
             tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
@@ -154,8 +153,9 @@ def test_metadata(self):
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             reader = torch.classes.torchvision.Video(full_path, "video", True)
-            self.assertAlmostEqual(config.video_fps, reader.fps("")[0], delta=0.0001)
-            self.assertAlmostEqual(config.duration, reader.duration("")[0], delta=0.5)
+            reader_md = reader.get_metadata()
+            self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
+            self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
       
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 9f3537b06e5..e073fd62d5c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -188,9 +188,13 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   logType = "file";
   logMessage = videoPath;
 
+  // locals
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
+  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
+  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
+
 
   // calback and metadata defined in struct
   succeeded = decoder.init(params, std::move(callback), &metadata);
@@ -215,8 +219,9 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       };
     }
   }
-  streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
-  streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+  audioMetadata.insert({{"duration", audioDuration}, {"framerate", audioFPS}});
+  videoMetadata.insert({{"duration", videoDuration}, {"fps", videoFPS}});
+  streamsMetadata.insert({{"video", videoMetadata}, {"audio", audioMetadata}});
 
   succeeded = Video::_setCurrentStream();
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
@@ -250,26 +255,8 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-std::vector<double> Video::getFPS(std::string stream) const {
-  // add safety check
-  if (stream.empty()) {
-    stream = get<0>(current_stream);
-  }
-  auto stream_tpl = _parseStream(stream);
-  std::string stream_str = get<0>(stream_tpl);
-  // check if the stream exists
-  return streamFPS.at(stream_str);
-}
-
-std::vector<double> Video::getDuration(std::string stream) const {
-  // add safety check
-  if (stream.empty()) {
-    stream = get<0>(current_stream);
-  }
-  auto stream_tpl = _parseStream(stream);
-  std::string stream_str = get<0>(stream_tpl);
-  // check if the stream exists
-  return streamDuration.at(stream_str);
+std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
+  return streamsMetadata;
 }
 
 void Video::Seek(double ts, bool any_frame = false) {
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index d9e0a4b7a44..b8db513d83f 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -23,14 +23,12 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
   std::tuple<std::string, long> current_stream; // streaam type, id
   // global video metadata
-  std::map<std::string, std::vector<double>> streamFPS;
-  std::map<std::string, std::vector<double>> streamDuration;
+  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
 
  public:
   Video(std::string videoPath, std::string stream, bool isReadFile);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  std::vector<double> getDuration(std::string stream = "") const;
-  std::vector<double> getFPS(std::string stream = "") const;
+  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
   void Seek(double ts, bool any_frame);
   std::tuple<torch::Tensor, double> Next(std::string stream);
 
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index cfd2d2dd5c1..8f3f46072f5 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,8 +9,7 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
-        .def("duration", &Video::getDuration)
-        .def("fps", &Video::getFPS)
+        .def("get_metadata", &Video::getStreamMetadata)
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);
 

From 0cd8ee559e2e3b27d97a78a9887a176a5e0421af Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 24 Sep 2020 16:09:31 +0100
Subject: [PATCH 041/128] remove implicit calls to set a current stream (#2)

---
 torchvision/csrc/cpu/video/Video.cpp    | 63 +++++++++++++------------
 torchvision/csrc/cpu/video/Video.h      | 13 ++---
 torchvision/csrc/cpu/video/register.cpp |  1 +
 3 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index e073fd62d5c..4367f1943d0 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -192,8 +192,8 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
-  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
 
 
   // calback and metadata defined in struct
@@ -219,11 +219,14 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       };
     }
   }
-  audioMetadata.insert({{"duration", audioDuration}, {"framerate", audioFPS}});
-  videoMetadata.insert({{"duration", videoDuration}, {"fps", videoFPS}});
-  streamsMetadata.insert({{"video", videoMetadata}, {"audio", audioMetadata}});
-
-  succeeded = Video::_setCurrentStream();
+  audioMetadata.insert("duration", audioDuration);
+  audioMetadata.insert("framerate", audioFPS);
+  videoMetadata.insert("duration", videoDuration);
+  videoMetadata.insert("fps", videoFPS);
+  streamsMetadata.insert("video", videoMetadata);
+  streamsMetadata.insert("audio", audioMetadata);
+
+  succeeded = Video::setCurrentStream();
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
   if (get<1>(current_stream) != -1) {
     LOG(INFO)
@@ -232,7 +235,12 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   }
 } // video
 
-bool Video::_setCurrentStream() {
+bool Video::setCurrentStream(std::string stream) {
+  
+  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
+    current_stream = _parseStream(stream);
+  }
+
   double ts = 0;
   if (seekTS > 0) {
     ts = seekTS;
@@ -248,6 +256,7 @@ bool Video::_setCurrentStream() {
   );
 
   // calback and metadata defined in Video.h
+  cout << "Decoder init at setStream " << succeeded << "\n" ;
   return (decoder.init(params, std::move(callback), &metadata));
 }
 
@@ -255,35 +264,27 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
+c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
   return streamsMetadata;
 }
 
-void Video::Seek(double ts, bool any_frame = false) {
+void Video::Seek(double ts) {
   // initialize the class variables used for seeking and retrurn
-  video_any_frame = any_frame;
-  seekTS = ts;
-  doSeek = true;
-}
-
-std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
-  bool newInit = false; // avoid unnecessary decoder initializations
-  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-    current_stream = _parseStream(stream);
-    newInit = true;
-  }
+  _getDecoderParams(
+      ts, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream
+      long(get<1>(
+          current_stream)), // stream_id parsed from info above change to -2
+      false // read all streams
+  );
 
-  if ((seekTS != -1) && (doSeek == true)) {
-    newInit = true;
-    doSeek = false;
-  }
+  // calback and metadata defined in Video.h
+  succeeded = decoder.init(params, std::move(callback), &metadata);
+  cout << "Decoder init at seek " << succeeded << "\n" ;
+}
 
-  if (newInit) {
-    succeeded = Video::_setCurrentStream();
-    if (succeeded) {
-      newInit = false;
-    }
-  }
+std::tuple<torch::Tensor, double> Video::Next() {
 
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index b8db513d83f..86244b26a46 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -21,16 +21,17 @@
 using namespace ffmpeg;
 
 struct Video : torch::CustomClassHolder {
-  std::tuple<std::string, long> current_stream; // streaam type, id
+  std::tuple<std::string, long> current_stream; // stream type, id
   // global video metadata
-  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
 
  public:
   Video(std::string videoPath, std::string stream, bool isReadFile);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
-  void Seek(double ts, bool any_frame);
-  std::tuple<torch::Tensor, double> Next(std::string stream);
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
+  void Seek(double ts);
+  bool setCurrentStream(std::string stream = "video");
+  std::tuple<torch::Tensor, double> Next();
 
  private:
   bool video_any_frame = false; // add this to input parameters?
@@ -48,7 +49,7 @@ struct Video : torch::CustomClassHolder {
       long stream_id,
       bool all_streams,
       double seekFrameMarginUs); // this needs to be improved
-  bool _setCurrentStream();
+
   std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
   DecoderInCallback callback = nullptr;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 8f3f46072f5..b5d68bd68c0 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,6 +9,7 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
+        .def("set_current_stream", &Video::setCurrentStream)
         .def("get_metadata", &Video::getStreamMetadata)
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);

From 5de29d051f151dc1bab692f51d88294dc9635a88 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 25 Sep 2020 12:34:47 -0500
Subject: [PATCH 042/128] Adding new tests to check the accuracy of the seek

---
 test/test_video.py | 77 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 7 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 05e75e2f598..ebf0a2a8ab8 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -97,6 +97,39 @@
     # ),
 }
 
+def _template_read_video(video_object, s=0, e=None):
+    
+    if e is None:
+        e = float("inf")
+    if e < s:
+        raise ValueError(
+            "end time should be larger than start time, got "
+            "start time={} and end time={}".format(s, e)
+        )
+
+    video_object.set_current_stream("video")
+    video_object.seek(s)
+    video_frames = torch.empty(0)
+    frames = []
+    t, pts = video_object.next()
+    while t.numel() > 0 and (pts >= s and pts <= e):
+        frames.append(t)
+        t, pts = video_object.next()
+    if len(frames) > 0:
+        video_frames = torch.stack(frames, 0)
+
+    video_object.set_current_stream("audio")
+    video_object.seek(s)
+    audio_frames = torch.empty(0)
+    frames = []
+    t, pts = video_object.next()
+    while t.numel() > 0 and (pts > s and pts <= e):
+        frames.append(t)
+        t, pts = video_object.next()
+    if len(frames) > 0:
+        audio_frames = torch.stack(frames, 0)
+
+    return video_frames, audio_frames, video_object.get_metadata()
 
 @unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
 class TestVideo(unittest.TestCase):
@@ -105,7 +138,6 @@ def test_read_video_tensor(self):
         Check if reading the video using the `next` based API yields the
         same sized and equal tensors as video_reader.
         """
-        print("test read")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -115,10 +147,10 @@ def test_read_video_tensor(self):
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             frames = []
-            t, _ = reader.next("")
+            t, _ = reader.next()
             while t.numel() > 0:
                 frames.append(t)
-                t, _ = reader.next("")
+                t, _ = reader.next()
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
             self.assertEqual(torch.equal(tv_result, new_api), True)
@@ -127,7 +159,6 @@ def test_read_video_tensor(self):
     def test_pts(self):
         """Check if the frames have the same timestamps
         """
-        print("test timestamp")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -136,10 +167,10 @@ def test_pts(self):
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             pts = []
-            t, p = reader.next("")
+            t, p = reader.next()
             while t.numel() > 0:
                 pts.append(p)
-                t, p = reader.next("")
+                t, p = reader.next()
             
             tv_timestamps = [float(p) for p in tv_timestamps]
             napi_pts = [float(p) for p in pts]
@@ -148,7 +179,6 @@ def test_pts(self):
 
     @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
     def test_metadata(self):
-        print("test fps")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -156,6 +186,39 @@ def test_metadata(self):
             reader_md = reader.get_metadata()
             self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
             self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
+
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_video_reading_fn(self):
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            video, audio, metadata = _template_read_video(reader)
+            tv_video, tv_audio, info = torchvision.io.read_video(full_path, pts_unit="sec")
+
+            self.assertEqual(torch.equal(tv_video.permute(0, 3, 1, 2), video), True)
+            self.assertEqual(torch.equal(tv_audio, audio), True)
+    
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_partial_video_reading_fn(self):
+        import random
+        print("Test video reader")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            # select two random points between 0 and duration
+            r = []
+            r.append(random.uniform(0, config.duration))
+            r.append(random.uniform(0, config.duration))
+            s = min(r)
+            e = max(r)
+
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            video, audio, metadata = _template_read_video(reader, s, e)
+            tv_video, tv_audio, info = torchvision.io.read_video(full_path, start_pts=s, end_pts=e, pts_unit="sec")
+            self.assertAlmostEqual(tv_video.size(0), video.size(0), delta=2.0)
       
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From b541943e3402ff874dac4dc94a2ccd0825eb455d Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 25 Sep 2020 12:36:13 -0500
Subject: [PATCH 043/128] cleanup debugging statements

---
 torchvision/csrc/cpu/video/Video.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4367f1943d0..bea0f16e676 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -256,7 +256,6 @@ bool Video::setCurrentStream(std::string stream) {
   );
 
   // calback and metadata defined in Video.h
-  cout << "Decoder init at setStream " << succeeded << "\n" ;
   return (decoder.init(params, std::move(callback), &metadata));
 }
 
@@ -281,7 +280,7 @@ void Video::Seek(double ts) {
 
   // calback and metadata defined in Video.h
   succeeded = decoder.init(params, std::move(callback), &metadata);
-  cout << "Decoder init at seek " << succeeded << "\n" ;
+  LOG(INFO) << "Decoder init at seek " << succeeded << "\n" ;
 }
 
 std::tuple<torch::Tensor, double> Video::Next() {

From 73103f57f6f63684055519d40864d6e237b40bad Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 13 Aug 2020 13:35:26 -0500
Subject: [PATCH 044/128] adding base files

---
 torchvision/csrc/cpu/video/Video.cpp |  0
 torchvision/csrc/cpu/video/Video.h   | 42 ++++++++++++++++++++++++++++
 video_reader.todo                    | 15 ++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 torchvision/csrc/cpu/video/Video.cpp
 create mode 100644 torchvision/csrc/cpu/video/Video.h
 create mode 100644 video_reader.todo

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
new file mode 100644
index 00000000000..13d05a90128
--- /dev/null
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifndef VIDEO_H_
+#define VIDEO_H_
+
+
+#include <string>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <Python.h>
+#include <c10/util/Logging.h>
+#include <torch/script.h>
+
+#include "../decoder/Stream.h"
+
+
+
+struct VideoMetadata{
+    double videoFps;  // average frame rate for the video (float)
+    double videoDuration; // real world video duration in seconds (float)
+    double videoStartTime; // video start time in seconds (float)
+    // do we need a constructor here?
+}
+
+class Video {
+    std::vector<VideoMetadata> Metadata;
+    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    public:
+        Video(std::string filename, std::string stream="video");
+        void Seek(double ts, std::string stream="", bool any_frame=False);
+        torch::List<torch::Tensor> Next(std::string stream="")
+        torch::List<torch::Tensor> Peak(std::string stream="")
+    protected:
+        // AV container type (check in decoder for exact type)
+    private:
+        int64_t SecToStream(double ts); // TODO: add stream type
+        float StreamToSec(int64_t pts); // TODO: add stream type
+        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
+} // class Video
+
+#endif  // VIDEO_H_
diff --git a/video_reader.todo b/video_reader.todo
new file mode 100644
index 00000000000..2a01bbde4b8
--- /dev/null
+++ b/video_reader.todo
@@ -0,0 +1,15 @@
+The new API:
+    ☐ the c++ extension is going to live in torchvision/csrc/cpu/video
+    ☐ modification of the build needs to go to setup.py
+    ☐ torchvision/io/_video something needs to happen somehow
+
+Tests changes:
+    ☐ test/test_io.py
+    ☐ test/test_video_reader.py (change to test video api)
+
+
+
+Implementation:
+    ☐ Datatype for strem
+    ☐ Datatype for container
+    ☐ Do I use tensor as a type in metadata 
\ No newline at end of file

From 41efdadb2ade59f6da49ff679a2fd3a0549088a8 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 14 Aug 2020 04:05:59 -0500
Subject: [PATCH 045/128] setup modification to actually build the thing

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1bc84897fa6..183da81fcb7 100644
--- a/setup.py
+++ b/setup.py
@@ -343,10 +343,13 @@ def get_extensions():
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(
             os.path.join(base_decoder_src_dir, "*.cpp"))
+        # Torchvision video API
+        videoapi_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video')
+        videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp"))
         # exclude tests
         base_decoder_src = [x for x in base_decoder_src if '_test.cpp' not in x]
 
-        combined_src = video_reader_src + base_decoder_src
+        combined_src = video_reader_src + base_decoder_src + videoapi_src
 
         ext_modules.append(
             CppExtension(
@@ -355,6 +358,7 @@ def get_extensions():
                 include_dirs=[
                     base_decoder_src_dir,
                     video_reader_src_dir,
+                    videoapi_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

From d3d7f4321a494f9e21172fe883eeec0f51aa4083 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 17 Aug 2020 10:07:05 -0500
Subject: [PATCH 046/128] video api constructor registration

---
 torchvision/csrc/cpu/video/Video.cpp    | 119 ++++++++++++++++++++++++
 torchvision/csrc/cpu/video/Video.h      |  32 ++++---
 torchvision/csrc/cpu/video/register.cpp |  16 ++++
 3 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 torchvision/csrc/cpu/video/register.cpp

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index e69de29bb2d..cb3cfa3e275 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -0,0 +1,119 @@
+
+# include "Video.h"
+#include <torch/script.h>
+#include <c10/util/Logging.h>
+#include "sync_decoder.h"
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+
+using namespace std;
+using namespace ffmpeg;
+
+
+// If we are in a Windows environment, we need to define
+// initialization functions for the _custom_ops extension
+#ifdef _WIN32
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC init_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#else
+PyMODINIT_FUNC PyInit_video_reader(void) {
+  // No need to do anything.
+  return NULL;
+}
+#endif
+#endif
+
+
+// namespace Video{
+const size_t decoderTimeoutMs = 600000;
+const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
+const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
+// A jitter can be added to the end of the range to avoid conversion/rounding
+// error, small value 100us won't be enough to select the next frame, but enough
+// to compensate rounding error due to the multiple conversions.
+const size_t timeBaseJitterUs = 100;
+
+void Video::_getDecoderParams(
+        int64_t videoStartUs,
+        int64_t getPtsOnly,
+        // how enum works, but stream type
+        int stream_id=-1,
+        double seekFrameMarginUs=10){
+
+    params.headerOnly = getPtsOnly != 0;
+    params.seekAccuracy = seekFrameMarginUs;
+    params.startOffset = videoStartUs;
+    params.timeoutMs = decoderTimeoutMs;
+    params.preventStaleness = false;  // not sure what this is about
+
+    // define the stream using the correct parsing technique
+} // _get decoder params
+
+
+Video::Video(
+    std::string videoPath, 
+    std::string stream, 
+    bool isReadFile, 
+    int64_t audioSamples=0, 
+    int64_t audioChannels=1) {
+
+
+    //parse stream information
+
+    // set current stream
+    DecoderParameters params;
+    Video::_getDecoderParams(
+        0,   // video start
+        false,  //headerOnly
+        // stream_type parsed from info above
+        // stream_id parsed from info above
+        audioSamples,
+        audioChannels
+    );
+
+    std::string logMessage, logType;
+    DecoderInCallback callback = nullptr;
+    // TODO: add read from memory option
+    params.uri = videoPath;
+    logType = "file";
+    logMessage = videoPath;
+    
+
+    // get a decoder
+    SyncDecoder decoder;
+    bool succeeded;
+
+    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+          << "] has started";
+
+    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    std::vector<DecoderMetadata> metadata;
+    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+        for (const auto& header : metadata) {
+            VLOG(1) << "Decoding stream of" << header.format.type ;
+        if (header.format.type == TYPE_VIDEO) {
+            videoMetadata = header;
+        } else if (header.format.type == TYPE_AUDIO) {
+            audioMetadata = header;
+        } else {
+            dataMetadata = header;
+        };
+        }
+    } 
+} //video
+
+// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
+// }
+
+// torch::List<torch::Tensor> Video::Next(){
+//     return
+// }
+
+
+
+// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 13d05a90128..00e3232de26 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -12,7 +12,12 @@
 #include <c10/util/Logging.h>
 #include <torch/script.h>
 
-#include "../decoder/Stream.h"
+#include <exception>
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+using namespace ffmpeg;
 
 
 
@@ -21,22 +26,23 @@ struct VideoMetadata{
     double videoDuration; // real world video duration in seconds (float)
     double videoStartTime; // video start time in seconds (float)
     // do we need a constructor here?
-}
+};
 
-class Video {
+struct Video : torch::CustomClassHolder {
     std::vector<VideoMetadata> Metadata;
-    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // std::vector<Stream> AvailStreams;  // TODO: add stream type
     public:
-        Video(std::string filename, std::string stream="video");
-        void Seek(double ts, std::string stream="", bool any_frame=False);
-        torch::List<torch::Tensor> Next(std::string stream="")
-        torch::List<torch::Tensor> Peak(std::string stream="")
-    protected:
+        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        // void Seek(double ts, std::string stream="", bool any_frame=False);
+        // torch::List<torch::Tensor> Next(std::string stream="")
+        // torch::List<torch::Tensor> Peak(std::string stream="")
+    // protected:
         // AV container type (check in decoder for exact type)
     private:
-        int64_t SecToStream(double ts); // TODO: add stream type
-        float StreamToSec(int64_t pts); // TODO: add stream type
-        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
-} // class Video
+        DecoderParameters params;
+        // int64_t SecToStream(double ts); // TODO: add stream type
+        // float StreamToSec(int64_t pts); // TODO: add stream type
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+}; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
new file mode 100644
index 00000000000..e4dde7a5530
--- /dev/null
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -0,0 +1,16 @@
+#ifndef REGISTER_H
+#define REGISTER_H
+
+#include "Video.h"
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+
+} //namespace
+#endif

From c6ea6daf81826fbc57656156096ac8ab1570b98b Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:27:47 -0500
Subject: [PATCH 047/128] FAIL metadata

---
 torchvision/csrc/cpu/video/Video.cpp    | 91 ++++++++++++++++++-------
 torchvision/csrc/cpu/video/Video.h      | 29 +++++---
 torchvision/csrc/cpu/video/register.cpp | 13 ++--
 3 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index cb3cfa3e275..fb7a6891a6b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,7 +29,6 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
-// namespace Video{
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -43,6 +42,7 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
+        bool all_streams=false,
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -51,6 +51,24 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
+    if (all_streams == true){
+        MediaFormat audioFormat((long) -2);
+        audioFormat.type = TYPE_AUDIO;
+        audioFormat.format.audio.format = defaultAudioSampleFormat;
+        params.formats.insert(audioFormat);
+
+        MediaFormat videoFormat(0, (long) -2);
+        videoFormat.type = TYPE_VIDEO;
+        videoFormat.format.video.format = defaultVideoPixelFormat;
+        params.formats.insert(videoFormat);
+
+        // MediaFormat subtitleFormat("0", (long) -2);
+        // subtitleFormat.type = TYPE_SUBTITLE;
+        // MediaFormat ccFormat((double) 0, (long) -2);
+        // ccFormat.type = TYPE_CC;
+
+    }
+
     // define the stream using the correct parsing technique
 } // _get decoder params
 
@@ -58,22 +76,20 @@ void Video::_getDecoderParams(
 Video::Video(
     std::string videoPath, 
     std::string stream, 
-    bool isReadFile, 
-    int64_t audioSamples=0, 
-    int64_t audioChannels=1) {
+    bool isReadFile) {
 
 
     //parse stream information
 
     // set current stream
+    // note that in the initial version we want to get all streams
     DecoderParameters params;
     Video::_getDecoderParams(
-        0,   // video start
+        0,      // video start
         false,  //headerOnly
         // stream_type parsed from info above
-        // stream_id parsed from info above
-        audioSamples,
-        audioChannels
+        -2,     // stream_id parsed from info above
+        true    // read all streams
     );
 
     std::string logMessage, logType;
@@ -88,32 +104,55 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
-
-    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    
+    std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
             VLOG(1) << "Decoding stream of" << header.format.type ;
-        if (header.format.type == TYPE_VIDEO) {
-            videoMetadata = header;
-        } else if (header.format.type == TYPE_AUDIO) {
-            audioMetadata = header;
-        } else {
-            dataMetadata = header;
-        };
+        
+            // generate streamMetadata object
+            StreamMetadata streamInfo;
+            // parse stream timebase
+            torch::Tensor timeBase = torch::zeros({1}, torch::kFloat);
+            float * timeBaseData = timeBase.data_ptr<float>();
+            timeBaseData[0] = header.num / header.den;
+            streamInfo.timeBase = timeBase;
+            // parse stream duration
+            torch::Tensor duration = torch::zeros({1}, torch::kFloat);
+            float* durationData = duration.data_ptr<float>();
+            durationData[0] = (float) header.duration;
+            // to get duration in seconds multiply duration by timebase
+            streamInfo.duration = duration * streamInfo.timeBase;
+            
+            if (header.format.type == TYPE_VIDEO) {
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = header.fps;
+                streamInfo.frameRate = frameRate;
+                videoStreams.push_back(streamInfo);
+            } else if (header.format.type == TYPE_AUDIO) {
+                const auto& format = header.format.format.audio;
+                // parse stream fps
+                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
+                float* frameRateData = frameRate.data_ptr<float>();
+                frameRateData[0] = (float) format.samples;
+                streamInfo.frameRate = frameRate;
+                audioStreams.push_back(streamInfo);
+            };
         }
+        VideoMetadata.insert({"video", videoStreams});
+        VideoMetadata.insert({"autio", audioStreams});
     } 
 } //video
 
-// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
-// }
-
-// torch::List<torch::Tensor> Video::Next(){
-//     return
-// }
-
+// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+int Video::getMetadata() {
+    // return VideoMetadata;
+    return 5;
+}
 
 
-// }; // namespace video
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 00e3232de26..f9c104c1217 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,6 +11,8 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
+#include <torch/custom_class.h>
+
 
 #include <exception>
 #include "sync_decoder.h"
@@ -21,18 +23,29 @@ using namespace ffmpeg;
 
 
 
-struct VideoMetadata{
-    double videoFps;  // average frame rate for the video (float)
-    double videoDuration; // real world video duration in seconds (float)
-    double videoStartTime; // video start time in seconds (float)
+struct StreamMetadata{
+    torch::Tensor frameRate;  // average frame rate for the video (float)
+    torch::Tensor duration; // real world video duration in seconds (float)
+    // torch::Tensor startTime; // video start time in seconds (float)
+    torch::Tensor timeBase;
     // do we need a constructor here?
+    explicit StreamMetadata(){
+        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
+        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
+        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
+    }
 };
 
+
 struct Video : torch::CustomClassHolder {
-    std::vector<VideoMetadata> Metadata;
-    // std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // metadata is defined as a dictionary where every 
+    // type has a vector containing metadata for that stream
+    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
+    
     public:
-        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        int getMetadata();
+        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
@@ -42,7 +55,7 @@ struct Video : torch::CustomClassHolder {
         DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
 }; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index e4dde7a5530..4c6cc6c09cd 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,14 +3,11 @@
 
 #include "Video.h"
 
-namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
+TORCH_LIBRARY(torchvision, m) {
 
-} //namespace
+    m.class_<Video>("video")
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
+}
 #endif

From bd2d78e9e9b794305222ef3855729062fe4ac15f Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:29:15 -0500
Subject: [PATCH 048/128] FAIL update for QS

---
 torchvision/csrc/cpu/video/Video.cpp | 10 +++++-----
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index fb7a6891a6b..f86ac66375b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -145,14 +145,14 @@ Video::Video(
             };
         }
         VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"autio", audioStreams});
+        VideoMetadata.insert({"audio", audioStreams});
     } 
 } //video
 
-// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-int Video::getMetadata() {
-    // return VideoMetadata;
-    return 5;
+std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+// int Video::getMetadata() {
+    return VideoMetadata;
+    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index f9c104c1217..9e02afe9713 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -44,7 +44,7 @@ struct Video : torch::CustomClassHolder {
     
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        int getMetadata();
+        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")

From f96ecffb421799266cca3c66dd138acc2dbf7c1e Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 05:23:51 -0500
Subject: [PATCH 049/128] revert

---
 torchvision/csrc/cpu/video/Video.cpp    |  4 ++--
 torchvision/csrc/cpu/video/register.cpp | 14 +++++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index f86ac66375b..4f871e5131c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -104,14 +104,14 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
+    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
     std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
-            VLOG(1) << "Decoding stream of" << header.format.type ;
+            cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
             StreamMetadata streamInfo;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 4c6cc6c09cd..a8343762388 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,11 +3,15 @@
 
 #include "Video.h"
 
+namespace {
 
-TORCH_LIBRARY(torchvision, m) {
+////////////////////////////////////////////////////////////////////////////////
+// typedefs.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool>());
+        // .def("get_metadata", &Video::getMetadata);
 
-    m.class_<Video>("video")
-        .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
-}
+} //namespace
 #endif

From fd6d0ddbd7bfecd5d5a23051d31a3b01e2634f2a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 06:32:23 -0500
Subject: [PATCH 050/128] debugging with Victor

---
 torchvision/csrc/cpu/video/Video.cpp    |  8 ++++----
 torchvision/csrc/cpu/video/Video.h      | 16 +++++++---------
 torchvision/csrc/cpu/video/register.cpp |  7 ++-----
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4f871e5131c..d62ece7a9e1 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -62,6 +62,7 @@ void Video::_getDecoderParams(
         videoFormat.format.video.format = defaultVideoPixelFormat;
         params.formats.insert(videoFormat);
 
+        // there is no clear way on how to use other formats- todo later
         // MediaFormat subtitleFormat("0", (long) -2);
         // subtitleFormat.type = TYPE_SUBTITLE;
         // MediaFormat ccFormat((double) 0, (long) -2);
@@ -69,7 +70,8 @@ void Video::_getDecoderParams(
 
     }
 
-    // define the stream using the correct parsing technique
+    // else use the stream using the correct parsing technique
+
 } // _get decoder params
 
 
@@ -139,7 +141,7 @@ Video::Video(
                 // parse stream fps
                 torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
                 float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples;
+                frameRateData[0] = (float) format.samples; // this is user defined? 
                 streamInfo.frameRate = frameRate;
                 audioStreams.push_back(streamInfo);
             };
@@ -150,9 +152,7 @@ Video::Video(
 } //video
 
 std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-// int Video::getMetadata() {
     return VideoMetadata;
-    // return 5;
 }
 
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9e02afe9713..92a8e939918 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,7 +11,6 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
-#include <torch/custom_class.h>
 
 
 #include <exception>
@@ -22,7 +21,6 @@
 using namespace ffmpeg;
 
 
-
 struct StreamMetadata{
     torch::Tensor frameRate;  // average frame rate for the video (float)
     torch::Tensor duration; // real world video duration in seconds (float)
@@ -37,25 +35,25 @@ struct StreamMetadata{
 };
 
 
+
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
     // type has a vector containing metadata for that stream
     std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
     
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    Video(std::string videoPath, std::string stream, bool isReadFile);
+    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
     // protected:
         // AV container type (check in decoder for exact type)
-    private:
-        DecoderParameters params;
+    DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-}; // class Video
+    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+}; // struct Video
+
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index a8343762388..357f4ccfe4c 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -5,13 +5,10 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
-// typedefs.h
-////////////////////////////////////////////////////////////////////////////////
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>());
-        // .def("get_metadata", &Video::getMetadata);
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
 
 } //namespace
 #endif

From 4f04fc8107d3a52ffa7a82bdc9f61a11061d4019 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 13 Aug 2020 13:35:26 -0500
Subject: [PATCH 051/128] adding base files

---
 torchvision/csrc/cpu/video/Video.h | 58 +++++++++++-------------------
 1 file changed, 21 insertions(+), 37 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 92a8e939918..224a0639be5 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -13,47 +13,31 @@
 #include <torch/script.h>
 
 
-#include <exception>
-#include "sync_decoder.h"
-#include "memory_buffer.h"
-#include "defs.h"
+#include "../decoder/Stream.h"
 
-using namespace ffmpeg;
 
 
-struct StreamMetadata{
-    torch::Tensor frameRate;  // average frame rate for the video (float)
-    torch::Tensor duration; // real world video duration in seconds (float)
-    // torch::Tensor startTime; // video start time in seconds (float)
-    torch::Tensor timeBase;
+struct VideoMetadata{
+    double videoFps;  // average frame rate for the video (float)
+    double videoDuration; // real world video duration in seconds (float)
+    double videoStartTime; // video start time in seconds (float)
     // do we need a constructor here?
-    explicit StreamMetadata(){
-        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
-        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
-        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
-    }
-};
-
-
-
-struct Video : torch::CustomClassHolder {
-    // metadata is defined as a dictionary where every 
-    // type has a vector containing metadata for that stream
-    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
-    
-    Video(std::string videoPath, std::string stream, bool isReadFile);
-    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
-        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
-        // void Seek(double ts, std::string stream="", bool any_frame=False);
-        // torch::List<torch::Tensor> Next(std::string stream="")
-        // torch::List<torch::Tensor> Peak(std::string stream="")
-    // protected:
+}
+
+class Video {
+    std::vector<VideoMetadata> Metadata;
+    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    public:
+        Video(std::string filename, std::string stream="video");
+        void Seek(double ts, std::string stream="", bool any_frame=False);
+        torch::List<torch::Tensor> Next(std::string stream="")
+        torch::List<torch::Tensor> Peak(std::string stream="")
+    protected:
         // AV container type (check in decoder for exact type)
-    DecoderParameters params;
-        // int64_t SecToStream(double ts); // TODO: add stream type
-        // float StreamToSec(int64_t pts); // TODO: add stream type
-    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-}; // struct Video
-
+    private:
+        int64_t SecToStream(double ts); // TODO: add stream type
+        float StreamToSec(int64_t pts); // TODO: add stream type
+        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
+} // class Video
 
 #endif  // VIDEO_H_

From 399d0e5fcc56f0fde83a1c6f14e938aa53363df3 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 17 Aug 2020 10:07:05 -0500
Subject: [PATCH 052/128] video api constructor registration

---
 torchvision/csrc/cpu/video/Video.cpp    | 68 ++++++++++++++++---------
 torchvision/csrc/cpu/video/Video.h      | 32 +++++++-----
 torchvision/csrc/cpu/video/register.cpp |  3 +-
 3 files changed, 64 insertions(+), 39 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d62ece7a9e1..dd461fa2278 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,6 +29,7 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
+
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -42,7 +43,7 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
-        bool all_streams=false,
+
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -51,40 +52,22 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
-    if (all_streams == true){
-        MediaFormat audioFormat((long) -2);
-        audioFormat.type = TYPE_AUDIO;
-        audioFormat.format.audio.format = defaultAudioSampleFormat;
-        params.formats.insert(audioFormat);
-
-        MediaFormat videoFormat(0, (long) -2);
-        videoFormat.type = TYPE_VIDEO;
-        videoFormat.format.video.format = defaultVideoPixelFormat;
-        params.formats.insert(videoFormat);
-
-        // there is no clear way on how to use other formats- todo later
-        // MediaFormat subtitleFormat("0", (long) -2);
-        // subtitleFormat.type = TYPE_SUBTITLE;
-        // MediaFormat ccFormat((double) 0, (long) -2);
-        // ccFormat.type = TYPE_CC;
-
-    }
-
-    // else use the stream using the correct parsing technique
-
 } // _get decoder params
 
 
 Video::Video(
     std::string videoPath, 
     std::string stream, 
-    bool isReadFile) {
+
+    bool isReadFile, 
+    int64_t audioSamples=0, 
+    int64_t audioChannels=1) {
 
 
     //parse stream information
 
     // set current stream
-    // note that in the initial version we want to get all streams
+
     DecoderParameters params;
     Video::_getDecoderParams(
         0,      // video start
@@ -92,6 +75,14 @@ Video::Video(
         // stream_type parsed from info above
         -2,     // stream_id parsed from info above
         true    // read all streams
+    DecoderParameters params;
+    Video::_getDecoderParams(
+        0,   // video start
+        false,  //headerOnly
+        // stream_type parsed from info above
+        // stream_id parsed from info above
+        audioSamples,
+        audioChannels
     );
 
     std::string logMessage, logType;
@@ -156,3 +147,32 @@ std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
 }
 
 
+    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
+          << "] has started";
+
+    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
+    std::vector<DecoderMetadata> metadata;
+    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+        for (const auto& header : metadata) {
+            VLOG(1) << "Decoding stream of" << header.format.type ;
+        if (header.format.type == TYPE_VIDEO) {
+            videoMetadata = header;
+        } else if (header.format.type == TYPE_AUDIO) {
+            audioMetadata = header;
+        } else {
+            dataMetadata = header;
+        };
+        }
+    } 
+} //video
+
+// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
+// }
+
+// torch::List<torch::Tensor> Video::Next(){
+//     return
+// }
+
+
+
+// }; // namespace video
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 224a0639be5..2f5c89249a0 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -13,7 +13,12 @@
 #include <torch/script.h>
 
 
-#include "../decoder/Stream.h"
+#include <exception>
+#include "sync_decoder.h"
+#include "memory_buffer.h"
+#include "defs.h"
+
+using namespace ffmpeg;
 
 
 
@@ -22,22 +27,23 @@ struct VideoMetadata{
     double videoDuration; // real world video duration in seconds (float)
     double videoStartTime; // video start time in seconds (float)
     // do we need a constructor here?
-}
+};
 
-class Video {
+struct Video : torch::CustomClassHolder {
     std::vector<VideoMetadata> Metadata;
-    std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // std::vector<Stream> AvailStreams;  // TODO: add stream type
     public:
-        Video(std::string filename, std::string stream="video");
-        void Seek(double ts, std::string stream="", bool any_frame=False);
-        torch::List<torch::Tensor> Next(std::string stream="")
-        torch::List<torch::Tensor> Peak(std::string stream="")
-    protected:
+        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        // void Seek(double ts, std::string stream="", bool any_frame=False);
+        // torch::List<torch::Tensor> Next(std::string stream="")
+        // torch::List<torch::Tensor> Peak(std::string stream="")
+    // protected:
         // AV container type (check in decoder for exact type)
     private:
-        int64_t SecToStream(double ts); // TODO: add stream type
-        float StreamToSec(int64_t pts); // TODO: add stream type
-        void SetVideoStream(std::string stream="video:0")  // this needs to be improved
-} // class Video
+        DecoderParameters params;
+        // int64_t SecToStream(double ts); // TODO: add stream type
+        // float StreamToSec(int64_t pts); // TODO: add stream type
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+}; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 357f4ccfe4c..b421d65fee7 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -7,8 +7,7 @@ namespace {
 
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
+        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
 
 } //namespace
 #endif

From 8f6c7f927d8e88e5d41f773fa118ca8f2216d18a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:27:47 -0500
Subject: [PATCH 053/128] FAIL metadata

---
 torchvision/csrc/cpu/video/Video.cpp    | 88 ++++++++++++++-----------
 torchvision/csrc/cpu/video/Video.h      | 29 +++++---
 torchvision/csrc/cpu/video/register.cpp | 10 ++-
 3 files changed, 73 insertions(+), 54 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index dd461fa2278..26794837687 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,7 +29,10 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
+<<<<<<< HEAD
 
+=======
+>>>>>>> FAIL metadata
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -43,7 +46,11 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
+<<<<<<< HEAD
 
+=======
+        bool all_streams=false,
+>>>>>>> FAIL metadata
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -52,21 +59,48 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
+<<<<<<< HEAD
+=======
+    if (all_streams == true){
+        MediaFormat audioFormat((long) -2);
+        audioFormat.type = TYPE_AUDIO;
+        audioFormat.format.audio.format = defaultAudioSampleFormat;
+        params.formats.insert(audioFormat);
+
+        MediaFormat videoFormat(0, (long) -2);
+        videoFormat.type = TYPE_VIDEO;
+        videoFormat.format.video.format = defaultVideoPixelFormat;
+        params.formats.insert(videoFormat);
+
+        // MediaFormat subtitleFormat("0", (long) -2);
+        // subtitleFormat.type = TYPE_SUBTITLE;
+        // MediaFormat ccFormat((double) 0, (long) -2);
+        // ccFormat.type = TYPE_CC;
+
+    }
+
+    // define the stream using the correct parsing technique
+>>>>>>> FAIL metadata
 } // _get decoder params
 
 
 Video::Video(
     std::string videoPath, 
     std::string stream, 
+<<<<<<< HEAD
 
     bool isReadFile, 
     int64_t audioSamples=0, 
     int64_t audioChannels=1) {
+=======
+    bool isReadFile) {
+>>>>>>> FAIL metadata
 
 
     //parse stream information
 
     // set current stream
+<<<<<<< HEAD
 
     DecoderParameters params;
     Video::_getDecoderParams(
@@ -75,14 +109,16 @@ Video::Video(
         // stream_type parsed from info above
         -2,     // stream_id parsed from info above
         true    // read all streams
+=======
+    // note that in the initial version we want to get all streams
+>>>>>>> FAIL metadata
     DecoderParameters params;
     Video::_getDecoderParams(
-        0,   // video start
+        0,      // video start
         false,  //headerOnly
         // stream_type parsed from info above
-        // stream_id parsed from info above
-        audioSamples,
-        audioChannels
+        -2,     // stream_id parsed from info above
+        true    // read all streams
     );
 
     std::string logMessage, logType;
@@ -97,14 +133,14 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
+    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
     std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
-            cout << "Decoding stream of" << header.format.type ;
+            VLOG(1) << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
             StreamMetadata streamInfo;
@@ -132,47 +168,19 @@ Video::Video(
                 // parse stream fps
                 torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
                 float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples; // this is user defined? 
+                frameRateData[0] = (float) format.samples;
                 streamInfo.frameRate = frameRate;
                 audioStreams.push_back(streamInfo);
             };
         }
         VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"audio", audioStreams});
+        VideoMetadata.insert({"autio", audioStreams});
     } 
 } //video
 
-std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-    return VideoMetadata;
+// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+int Video::getMetadata() {
+    // return VideoMetadata;
+    return 5;
 }
 
-
-    VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] has started";
-
-    DecoderMetadata audioMetadata, videoMetadata, dataMetadata;
-    std::vector<DecoderMetadata> metadata;
-    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
-        for (const auto& header : metadata) {
-            VLOG(1) << "Decoding stream of" << header.format.type ;
-        if (header.format.type == TYPE_VIDEO) {
-            videoMetadata = header;
-        } else if (header.format.type == TYPE_AUDIO) {
-            audioMetadata = header;
-        } else {
-            dataMetadata = header;
-        };
-        }
-    } 
-} //video
-
-// void Video::Seek(float time_s, std::string stream="", bool any_frame=False){
-// }
-
-// torch::List<torch::Tensor> Video::Next(){
-//     return
-// }
-
-
-
-// }; // namespace video
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 2f5c89249a0..9a10ced2a57 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,6 +11,8 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
+#include <torch/custom_class.h>
+
 
 
 #include <exception>
@@ -22,18 +24,29 @@ using namespace ffmpeg;
 
 
 
-struct VideoMetadata{
-    double videoFps;  // average frame rate for the video (float)
-    double videoDuration; // real world video duration in seconds (float)
-    double videoStartTime; // video start time in seconds (float)
+struct StreamMetadata{
+    torch::Tensor frameRate;  // average frame rate for the video (float)
+    torch::Tensor duration; // real world video duration in seconds (float)
+    // torch::Tensor startTime; // video start time in seconds (float)
+    torch::Tensor timeBase;
     // do we need a constructor here?
+    explicit StreamMetadata(){
+        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
+        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
+        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
+    }
 };
 
+
 struct Video : torch::CustomClassHolder {
-    std::vector<VideoMetadata> Metadata;
-    // std::vector<Stream> AvailStreams;  // TODO: add stream type
+    // metadata is defined as a dictionary where every 
+    // type has a vector containing metadata for that stream
+    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
+    
     public:
-        Video(std::string videoPath, std::string stream, bool isReadFile, int64_t audioSamples, int64_t audioChannels);
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        int getMetadata();
+        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
@@ -43,7 +56,7 @@ struct Video : torch::CustomClassHolder {
         DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
 }; // class Video
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index b421d65fee7..b35ec62b589 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,11 +3,9 @@
 
 #include "Video.h"
 
-namespace {
 
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool, int64_t, int64_t>());
-
-} //namespace
+    m.class_<Video>("video")
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
+}
 #endif

From 68bc32b5d922681dc40eadc4556dd82037084dc4 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 19 Aug 2020 11:29:15 -0500
Subject: [PATCH 054/128] FAIL update for QS

---
 torchvision/csrc/cpu/video/Video.cpp | 10 +++++-----
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 26794837687..0a7cae793d7 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -174,13 +174,13 @@ Video::Video(
             };
         }
         VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"autio", audioStreams});
+        VideoMetadata.insert({"audio", audioStreams});
     } 
 } //video
 
-// // std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-int Video::getMetadata() {
-    // return VideoMetadata;
-    return 5;
+std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
+// int Video::getMetadata() {
+    return VideoMetadata;
+    // return 5;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9a10ced2a57..b65ad27ad86 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -45,7 +45,7 @@ struct Video : torch::CustomClassHolder {
     
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        int getMetadata();
+        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")

From 4cc2895e7e0136b421ed778831304ee664b4b77a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 05:23:51 -0500
Subject: [PATCH 055/128] revert

---
 torchvision/csrc/cpu/video/Video.cpp    |  4 ++--
 torchvision/csrc/cpu/video/register.cpp | 11 +++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 0a7cae793d7..6377b041a1d 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -133,14 +133,14 @@ Video::Video(
     SyncDecoder decoder;
     bool succeeded;
 
-    VLOG(1) << "Video decoding to gather metadata from " << logType << " [" << logMessage
+    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
     std::vector<StreamMetadata> videoStreams, audioStreams;
     std::vector<DecoderMetadata> metadata;
     if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
         for (const auto& header : metadata) {
-            VLOG(1) << "Decoding stream of" << header.format.type ;
+            cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
             StreamMetadata streamInfo;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index b35ec62b589..8712012c621 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -3,9 +3,12 @@
 
 #include "Video.h"
 
+namespace {
 
-    m.class_<Video>("video")
-        .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
-}
+static auto registerVideo =
+    torch::class_<Video>("torchvision", "Video")
+        .def(torch::init<std::string, std::string, bool>());
+        // .def("get_metadata", &Video::getMetadata);
+
+} //namespace
 #endif

From 720c3271cd7874cb49f574b00c72a3de59136335 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 20 Aug 2020 06:32:23 -0500
Subject: [PATCH 056/128] debugging with Victor

---
 torchvision/csrc/cpu/video/Video.cpp    | 37 ++++---------------------
 torchvision/csrc/cpu/video/Video.h      | 16 +++++------
 torchvision/csrc/cpu/video/register.cpp |  4 +--
 3 files changed, 15 insertions(+), 42 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 6377b041a1d..1a434c7dba1 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,10 +29,7 @@ PyMODINIT_FUNC PyInit_video_reader(void) {
 #endif
 
 
-<<<<<<< HEAD
 
-=======
->>>>>>> FAIL metadata
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
@@ -46,11 +43,8 @@ void Video::_getDecoderParams(
         int64_t getPtsOnly,
         // how enum works, but stream type
         int stream_id=-1,
-<<<<<<< HEAD
 
-=======
         bool all_streams=false,
->>>>>>> FAIL metadata
         double seekFrameMarginUs=10){
 
     params.headerOnly = getPtsOnly != 0;
@@ -59,8 +53,7 @@ void Video::_getDecoderParams(
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
-<<<<<<< HEAD
-=======
+
     if (all_streams == true){
         MediaFormat audioFormat((long) -2);
         audioFormat.type = TYPE_AUDIO;
@@ -72,6 +65,7 @@ void Video::_getDecoderParams(
         videoFormat.format.video.format = defaultVideoPixelFormat;
         params.formats.insert(videoFormat);
 
+        // there is no clear way on how to use other formats- todo later
         // MediaFormat subtitleFormat("0", (long) -2);
         // subtitleFormat.type = TYPE_SUBTITLE;
         // MediaFormat ccFormat((double) 0, (long) -2);
@@ -79,39 +73,22 @@ void Video::_getDecoderParams(
 
     }
 
-    // define the stream using the correct parsing technique
->>>>>>> FAIL metadata
+
+    // else use the stream using the correct parsing technique
+
 } // _get decoder params
 
 
 Video::Video(
     std::string videoPath, 
     std::string stream, 
-<<<<<<< HEAD
-
-    bool isReadFile, 
-    int64_t audioSamples=0, 
-    int64_t audioChannels=1) {
-=======
     bool isReadFile) {
->>>>>>> FAIL metadata
 
 
     //parse stream information
 
     // set current stream
-<<<<<<< HEAD
 
-    DecoderParameters params;
-    Video::_getDecoderParams(
-        0,      // video start
-        false,  //headerOnly
-        // stream_type parsed from info above
-        -2,     // stream_id parsed from info above
-        true    // read all streams
-=======
-    // note that in the initial version we want to get all streams
->>>>>>> FAIL metadata
     DecoderParameters params;
     Video::_getDecoderParams(
         0,      // video start
@@ -168,7 +145,7 @@ Video::Video(
                 // parse stream fps
                 torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
                 float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples;
+                frameRateData[0] = (float) format.samples; // this is user defined? 
                 streamInfo.frameRate = frameRate;
                 audioStreams.push_back(streamInfo);
             };
@@ -179,8 +156,6 @@ Video::Video(
 } //video
 
 std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-// int Video::getMetadata() {
     return VideoMetadata;
-    // return 5;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index b65ad27ad86..ac6fc6c3036 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -11,7 +11,6 @@
 #include <Python.h>
 #include <c10/util/Logging.h>
 #include <torch/script.h>
-#include <torch/custom_class.h>
 
 
 
@@ -23,7 +22,6 @@
 using namespace ffmpeg;
 
 
-
 struct StreamMetadata{
     torch::Tensor frameRate;  // average frame rate for the video (float)
     torch::Tensor duration; // real world video duration in seconds (float)
@@ -38,25 +36,25 @@ struct StreamMetadata{
 };
 
 
+
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
     // type has a vector containing metadata for that stream
     std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
     
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    Video(std::string videoPath, std::string stream, bool isReadFile);
+    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
     // protected:
         // AV container type (check in decoder for exact type)
-    private:
-        DecoderParameters params;
+    DecoderParameters params;
         // int64_t SecToStream(double ts); // TODO: add stream type
         // float StreamToSec(int64_t pts); // TODO: add stream type
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-}; // class Video
+    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+}; // struct Video
+
 
 #endif  // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 8712012c621..357f4ccfe4c 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -7,8 +7,8 @@ namespace {
 
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>());
-        // .def("get_metadata", &Video::getMetadata);
+        .def(torch::init<std::string, std::string, bool>())
+        .def("get_metadata", &Video::getMetadata);
 
 } //namespace
 #endif

From 9a5aa7cb6988805bf8451b0b4a9c438747ad138a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 21 Aug 2020 05:14:22 -0500
Subject: [PATCH 057/128] metadata registration works

---
 torchvision/csrc/cpu/video/Video.cpp    | 151 +++++++++++++++++-------
 torchvision/csrc/cpu/video/Video.h      |  43 ++++---
 torchvision/csrc/cpu/video/register.cpp |   3 +-
 3 files changed, 132 insertions(+), 65 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 1a434c7dba1..b76e834e46e 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -38,12 +38,78 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // to compensate rounding error due to the multiple conversions.
 const size_t timeBaseJitterUs = 100;
 
+
+std::string parse_type_to_string(const std::string& stream_string) {
+  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
+      {"video", TYPE_VIDEO},
+      {"audio", TYPE_AUDIO},
+      {"subtitle", TYPE_SUBTITLE},
+      {"cc", TYPE_CC},
+  }};
+  auto device = std::find_if(
+      types.begin(),
+      types.end(),
+      [stream_string](const std::pair<std::string, MediaType>& p) {
+        return p.first == stream_string;
+      });
+  if (device != types.end()) {
+    return device->first;
+  }
+  AT_ERROR(
+      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+}
+
+MediaType parse_type_to_mt(const std::string& stream_string) {
+  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
+      {"video", TYPE_VIDEO},
+      {"audio", TYPE_AUDIO},
+      {"subtitle", TYPE_SUBTITLE},
+      {"cc", TYPE_CC},
+  }};
+  auto device = std::find_if(
+      types.begin(),
+      types.end(),
+      [stream_string](const std::pair<std::string, MediaType>& p) {
+        return p.first == stream_string;
+      });
+  if (device != types.end()) {
+    return device->second;
+  }
+  AT_ERROR(
+      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+}
+
+std::tuple<std::string, int64_t> Video::_parseStream(const std::string& streamString){
+    TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
+    static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
+    std::smatch match;
+
+    TORCH_CHECK(
+        std::regex_match(streamString, match, regex),
+        "Invalid stream string: '", streamString, "'");
+    
+    std::string type_ = "video";
+    type_ = parse_type_to_string(match[1].str());
+    int64_t index_ = -1;
+    if (match[2].matched) {
+        try {
+        index_ = c10::stoi(match[2].str());
+        } catch (const std::exception &) {
+        AT_ERROR(
+            "Could not parse device index '", match[2].str(),
+            "' in device string '", streamString, "'");
+        }
+    }
+    return std::make_tuple(type_, index_);
+}
+
+
 void Video::_getDecoderParams(
         int64_t videoStartUs,
         int64_t getPtsOnly,
-        // how enum works, but stream type
-        int stream_id=-1,
 
+        std::string stream,
+        long stream_id=-1,
         bool all_streams=false,
         double seekFrameMarginUs=10){
 
@@ -66,10 +132,13 @@ void Video::_getDecoderParams(
         params.formats.insert(videoFormat);
 
         // there is no clear way on how to use other formats- todo later
-        // MediaFormat subtitleFormat("0", (long) -2);
-        // subtitleFormat.type = TYPE_SUBTITLE;
-        // MediaFormat ccFormat((double) 0, (long) -2);
-        // ccFormat.type = TYPE_CC;
+        MediaFormat subtitleFormat(char('0'), long(-2));
+        subtitleFormat.type = TYPE_SUBTITLE;
+        params.formats.insert(subtitleFormat);
+
+        MediaFormat ccFormat(double(0), long(-2));
+        ccFormat.type = TYPE_CC;
+        params.formats.insert(ccFormat);
 
     }
 
@@ -84,17 +153,13 @@ Video::Video(
     std::string stream, 
     bool isReadFile) {
 
-
     //parse stream information
 
-    // set current stream
-
-    DecoderParameters params;
     Video::_getDecoderParams(
         0,      // video start
         false,  //headerOnly
-        // stream_type parsed from info above
-        -2,     // stream_id parsed from info above
+        get<0>(current_stream),
+        long(-2),     // stream_id parsed from info above
         true    // read all streams
     );
 
@@ -107,55 +172,57 @@ Video::Video(
     
 
     // get a decoder
-    SyncDecoder decoder;
     bool succeeded;
 
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
-    std::vector<StreamMetadata> videoStreams, audioStreams;
+    std::vector<double> videoFPS, audioFPS, ccFPS, subtitleFPS;
+
     std::vector<DecoderMetadata> metadata;
-    if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
+    succeeded = decoder.init(params, std::move(callback), &metadata);
+    if (succeeded) {
         for (const auto& header : metadata) {
             cout << "Decoding stream of" << header.format.type ;
         
             // generate streamMetadata object
-            StreamMetadata streamInfo;
+            // std::map<std::string, double> streamInfo;
             // parse stream timebase
-            torch::Tensor timeBase = torch::zeros({1}, torch::kFloat);
-            float * timeBaseData = timeBase.data_ptr<float>();
-            timeBaseData[0] = header.num / header.den;
-            streamInfo.timeBase = timeBase;
+            // streamInfo.insert({"timeBase", (double) (header.num / header.den)});
             // parse stream duration
-            torch::Tensor duration = torch::zeros({1}, torch::kFloat);
-            float* durationData = duration.data_ptr<float>();
-            durationData[0] = (float) header.duration;
             // to get duration in seconds multiply duration by timebase
-            streamInfo.duration = duration * streamInfo.timeBase;
-            
+            // streamInfo.insert({"duration", (double) header.duration * (double) (header.num / header.den)});
+                        
             if (header.format.type == TYPE_VIDEO) {
                 // parse stream fps
-                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
-                float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = header.fps;
-                streamInfo.frameRate = frameRate;
-                videoStreams.push_back(streamInfo);
+                double fps = double(header.fps);
+                videoFPS.push_back(fps);
             } else if (header.format.type == TYPE_AUDIO) {
-                const auto& format = header.format.format.audio;
-                // parse stream fps
-                torch::Tensor frameRate = torch::zeros({1}, torch::kFloat);
-                float* frameRateData = frameRate.data_ptr<float>();
-                frameRateData[0] = (float) format.samples; // this is user defined? 
-                streamInfo.frameRate = frameRate;
-                audioStreams.push_back(streamInfo);
+                // parse stream fps (user defined, doesn't seem cool)
+                double fps = double(0);
+                audioFPS.push_back(fps);
+            } else{
+                cout << "Got type" << header.format.type; 
             };
         }
-        VideoMetadata.insert({"video", videoStreams});
-        VideoMetadata.insert({"audio", audioStreams});
-    } 
+
+    } else{
+        audioFPS.push_back((-1.0));
+        videoFPS.push_back((-1.0));
+
+    }
+    streamMetadata.insert({"video", videoFPS});
+    streamMetadata.insert({"audio", audioFPS});
 } //video
 
-std::map<std::string, std::vector<StreamMetadata>> Video::getMetadata(){
-    return VideoMetadata;
+std::tuple<std::string, int64_t> Video::getCurrentStream() const {
+    return current_stream;
 }
 
+std::vector<double> Video::getFPS(std::string stream) const{
+    // add safety check
+    std::string stream_str = parse_type_to_string(stream);
+    return streamMetadata.at(stream_str);
+}
+
+
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index ac6fc6c3036..94d5c5a781b 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -6,6 +6,8 @@
 
 #include <string>
 #include <vector>
+#include <regex>
+#include <map>
 
 #include <ATen/ATen.h>
 #include <Python.h>
@@ -22,38 +24,35 @@
 using namespace ffmpeg;
 
 
-struct StreamMetadata{
-    torch::Tensor frameRate;  // average frame rate for the video (float)
-    torch::Tensor duration; // real world video duration in seconds (float)
-    // torch::Tensor startTime; // video start time in seconds (float)
-    torch::Tensor timeBase;
-    // do we need a constructor here?
-    explicit StreamMetadata(){
-        torch::Tensor frameRate = torch::zeros({0}, torch::kFloat);
-        torch::Tensor duration = torch::zeros({0}, torch::kFloat);
-        torch::Tensor timeBase = torch::zeros({0}, torch::kFloat); 
-    }
-};
-
 
 
 struct Video : torch::CustomClassHolder {
     // metadata is defined as a dictionary where every 
-    // type has a vector containing metadata for that stream
-    std::map<std::string, std::vector<StreamMetadata>> VideoMetadata;
-    
-    Video(std::string videoPath, std::string stream, bool isReadFile);
-    std::map<std::string, std::vector<StreamMetadata>> getMetadata();
+    // type value is a list of lists that contains tuple <char: "info", double: "value">
+    std::tuple<std::string, int64_t> current_stream;
+    std::map<std::string, std::vector<double>> streamMetadata;
+    public:
+        Video(std::string videoPath, std::string stream, bool isReadFile);
+        std::tuple<std::string, int64_t> getCurrentStream() const;
+        std::vector<double> getFPS(std::string stream) const;
+
+    private:
+        std::tuple<std::string, int64_t> _parseStream(const std::string& streamString);
+        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+
+    // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         // void Seek(double ts, std::string stream="", bool any_frame=False);
         // torch::List<torch::Tensor> Next(std::string stream="")
         // torch::List<torch::Tensor> Peak(std::string stream="")
-    // protected:
+    protected:
         // AV container type (check in decoder for exact type)
-    DecoderParameters params;
+        SyncDecoder decoder;
+        DecoderParameters params;
+
         // int64_t SecToStream(double ts); // TODO: add stream type
-        // float StreamToSec(int64_t pts); // TODO: add stream type
-    void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, int stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        // double StreamToSec(int64_t pts); // TODO: add stream type
+    
 }; // struct Video
 
 
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 357f4ccfe4c..bfa3d58cec7 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -8,7 +8,8 @@ namespace {
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
-        .def("get_metadata", &Video::getMetadata);
+        .def("get_current_stream", &Video::getCurrentStream)
+        .def("get_FPS", &Video::getFPS);
 
 } //namespace
 #endif

From e002dfb70b521db718ab8f38ab8bea04f4395f43 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 26 Aug 2020 04:34:51 -0500
Subject: [PATCH 058/128] API build next

---
 torchvision/csrc/cpu/video/Video.cpp    | 160 +++++++++++++++++++-----
 torchvision/csrc/cpu/video/Video.h      |  26 ++--
 torchvision/csrc/cpu/video/register.cpp |   5 +-
 3 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index b76e834e46e..077885c7788 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -79,7 +79,7 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
       "Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, int64_t> Video::_parseStream(const std::string& streamString){
+std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
     TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
     static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
     std::smatch match;
@@ -140,11 +140,47 @@ void Video::_getDecoderParams(
         ccFormat.type = TYPE_CC;
         params.formats.insert(ccFormat);
 
+    } else{
+        // TODO: reset params.formats
+        std::set<MediaFormat> formats;
+        params.formats = formats;
+        MediaType stream_type = parse_type_to_mt(stream);
+        // now here is a mindfuck 
+        // - there is no way to construct mediaformat by type so we actually
+        // need an endless if/then
+        switch(stream_type) {
+            case TYPE_VIDEO:
+            {
+                MediaFormat videoFormat(0, (long) stream_id);
+                videoFormat.type = TYPE_VIDEO;
+                videoFormat.format.video.format = defaultVideoPixelFormat;
+                params.formats.insert(videoFormat);
+                break;
+            }
+            case TYPE_AUDIO:
+            {        
+                MediaFormat audioFormat((long) stream_id);
+                audioFormat.type = TYPE_AUDIO;
+                audioFormat.format.audio.format = defaultAudioSampleFormat;
+                params.formats.insert(audioFormat);
+                break;
+            }
+            // case TYPE_CC:
+            //     MediaFormat subtitleFormat(char('0'), long(stream_id));
+            //     subtitleFormat.type = TYPE_SUBTITLE;
+            //     params.formats.insert(subtitleFormat);
+            //     break;
+            default:
+            {
+                MediaFormat videoFormat(0, (long) -1);
+                videoFormat.type = TYPE_VIDEO;
+                videoFormat.format.video.format = defaultVideoPixelFormat;
+                params.formats.insert(videoFormat);
+                break;
+            }
+        }
     }
 
-
-    // else use the stream using the correct parsing technique
-
 } // _get decoder params
 
 
@@ -164,7 +200,7 @@ Video::Video(
     );
 
     std::string logMessage, logType;
-    DecoderInCallback callback = nullptr;
+    
     // TODO: add read from memory option
     params.uri = videoPath;
     logType = "file";
@@ -177,42 +213,50 @@ Video::Video(
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
           << "] has started";
     
-    std::vector<double> videoFPS, audioFPS, ccFPS, subtitleFPS;
 
-    std::vector<DecoderMetadata> metadata;
+    
+    std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
+    std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
+    std::vector<double> audioTB, videoTB, ccTB, subsTB;
+
+    // calback and metadata defined in struct
+    callback = nullptr;
     succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded) {
         for (const auto& header : metadata) {
-            cout << "Decoding stream of" << header.format.type ;
-        
-            // generate streamMetadata object
-            // std::map<std::string, double> streamInfo;
-            // parse stream timebase
-            // streamInfo.insert({"timeBase", (double) (header.num / header.den)});
-            // parse stream duration
-            // to get duration in seconds multiply duration by timebase
-            // streamInfo.insert({"duration", (double) header.duration * (double) (header.num / header.den)});
-                        
+            double fps = double(header.fps);
+            double timeBase = double(header.num) / double(header.den);
+            double duration = double(header.duration) * 1e-6; // * timeBase;
+
+
+            cout << "Decoding stream of" << header.format.type;
+            cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
+
+
             if (header.format.type == TYPE_VIDEO) {
-                // parse stream fps
-                double fps = double(header.fps);
                 videoFPS.push_back(fps);
+                videoDuration.push_back(duration);
+                videoTB.push_back(timeBase);
             } else if (header.format.type == TYPE_AUDIO) {
-                // parse stream fps (user defined, doesn't seem cool)
-                double fps = double(0);
                 audioFPS.push_back(fps);
-            } else{
-                cout << "Got type" << header.format.type; 
+                audioDuration.push_back(duration);
+                audioTB.push_back(timeBase);
+            } else if (header.format.type == TYPE_CC){
+                ccFPS.push_back(fps);
+                ccDuration.push_back(duration);
+                ccTB.push_back(timeBase);
+            } else if (header.format.type == TYPE_SUBTITLE){
+                subsFPS.push_back(fps);
+                subsDuration.push_back(duration);
+                subsTB.push_back(timeBase);
             };
         }
 
-    } else{
-        audioFPS.push_back((-1.0));
-        videoFPS.push_back((-1.0));
-
     }
-    streamMetadata.insert({"video", videoFPS});
-    streamMetadata.insert({"audio", audioFPS});
+
+    streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
+    streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+    streamTimeBase.insert({{"video", videoTB}, {"audio", audioTB}});
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -221,8 +265,62 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
 
 std::vector<double> Video::getFPS(std::string stream) const{
     // add safety check
-    std::string stream_str = parse_type_to_string(stream);
-    return streamMetadata.at(stream_str);
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    std::string stream_str = get<0>(stream_tpl);
+    // check if the stream exists
+    return streamFPS.at(stream_str);
 }
 
+std::vector<double> Video::getDuration(std::string stream) const{
+    // add safety check
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    std::string stream_str = get<0>(stream_tpl);
+    // check if the stream exists
+    return streamDuration.at(stream_str);
+}
+
+int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
+    if (stream.empty()){
+        stream = get<0>(current_stream);
+    }
+    auto stream_tpl = _parseStream(stream);
+    // check if the stream exists
+
+    // convert time to microseconds and cast to unsigned long int
+    int64_t ts_out = int64_t(ts * 1e6);
+
+    Video::_getDecoderParams(
+        ts_out,
+        0, // we're in full get frame mode
+        get<0>(stream_tpl),
+        get<1>(stream_tpl),
+        false);
+    
+    bool succeeded = decoder.init(params, std::move(callback), &metadata);
+    if (succeeded){
+        return 0;
+    }
+
+    return 1;
+
+}
+
+
+int64_t Video::Next(std::string stream=""){
+
+    DecoderOutputMessage out;
+    int64_t res = decoder.decode(&out, decoderTimeoutMs);
+
+    if (res == 0){
+        return 0;
+    }
+    
+    return 1;
+}
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 94d5c5a781b..74993382894 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -20,6 +20,7 @@
 #include "sync_decoder.h"
 #include "memory_buffer.h"
 #include "defs.h"
+#include "util.h"
 
 using namespace ffmpeg;
 
@@ -27,28 +28,35 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    // metadata is defined as a dictionary where every 
-    // type value is a list of lists that contains tuple <char: "info", double: "value">
+    bool any_frame=false; // add this to input parameters
     std::tuple<std::string, int64_t> current_stream;
-    std::map<std::string, std::vector<double>> streamMetadata;
+    std::map<std::string, std::vector<double>> streamFPS;
+    std::map<std::string, std::vector<double>> streamDuration;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
         std::tuple<std::string, int64_t> getCurrentStream() const;
-        std::vector<double> getFPS(std::string stream) const;
+        std::vector<double> getDuration(std::string stream="") const;
+        std::vector<double> getFPS(std::string stream="") const;
+        int64_t Seek(double ts, std::string stream, bool any_frame);
+        int64_t Next(std::string stream); //torch::List<torch::Tensor>
 
     private:
-        std::tuple<std::string, int64_t> _parseStream(const std::string& streamString);
         void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        std::map<std::string, std::vector<double>> streamTimeBase;
 
+        SyncDecoder decoder;
+        DecoderParameters params;
+
+        DecoderInCallback callback;
+        std::vector<DecoderMetadata> metadata;
     // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
-        // void Seek(double ts, std::string stream="", bool any_frame=False);
-        // torch::List<torch::Tensor> Next(std::string stream="")
+        
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)
-        SyncDecoder decoder;
-        DecoderParameters params;
+        
+        
 
         // int64_t SecToStream(double ts); // TODO: add stream type
         // double StreamToSec(int64_t pts); // TODO: add stream type
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index bfa3d58cec7..091052f4808 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,7 +9,10 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
-        .def("get_FPS", &Video::getFPS);
+        .def("duration", &Video::getDuration)
+        .def("fps", &Video::getFPS)
+        .def("seek", &Video::Seek)
+        .def("next", &Video::Next);
 
 } //namespace
 #endif

From 86c57efca0e7c748f79b0508c209ac97d305bffe Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 27 Aug 2020 04:15:41 -0500
Subject: [PATCH 059/128] test

---
 dev.py                               |  4 +++
 torchvision/csrc/cpu/video/Video.cpp | 38 +++++++++++++++-------------
 torchvision/csrc/cpu/video/Video.h   |  3 ++-
 3 files changed, 26 insertions(+), 19 deletions(-)
 create mode 100644 dev.py

diff --git a/dev.py b/dev.py
new file mode 100644
index 00000000000..9a66f867837
--- /dev/null
+++ b/dev.py
@@ -0,0 +1,4 @@
+import torch, torchvision
+video_path = "/home/bjuncek/work/video_reader_benchmark/videos/R6llTwEh07w.mp4"
+video = torch.classes.torchvision.Video(video_path, "video:0", True)
+video.next("video")
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 077885c7788..0d9898ac5e5 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -14,19 +14,19 @@ using namespace ffmpeg;
 
 // If we are in a Windows environment, we need to define
 // initialization functions for the _custom_ops extension
-#ifdef _WIN32
-#if PY_MAJOR_VERSION < 3
-PyMODINIT_FUNC init_video_reader(void) {
-  // No need to do anything.
-  return NULL;
-}
-#else
-PyMODINIT_FUNC PyInit_video_reader(void) {
-  // No need to do anything.
-  return NULL;
-}
-#endif
-#endif
+// #ifdef _WIN32
+// #if PY_MAJOR_VERSION < 3
+// PyMODINIT_FUNC init_video_reader(void) {
+//   // No need to do anything.
+//   return NULL;
+// }
+// #else
+// PyMODINIT_FUNC PyInit_video_reader(void) {
+//   // No need to do anything.
+//   return NULL;
+// }
+// #endif
+// #endif
 
 
 
@@ -116,6 +116,7 @@ void Video::_getDecoderParams(
     params.headerOnly = getPtsOnly != 0;
     params.seekAccuracy = seekFrameMarginUs;
     params.startOffset = videoStartUs;
+    params.endOffset = std::numeric_limits<long>::infinity();
     params.timeoutMs = decoderTimeoutMs;
     params.preventStaleness = false;  // not sure what this is about
 
@@ -129,6 +130,10 @@ void Video::_getDecoderParams(
         MediaFormat videoFormat(0, (long) -2);
         videoFormat.type = TYPE_VIDEO;
         videoFormat.format.video.format = defaultVideoPixelFormat;
+        videoFormat.format.video.width = 0;
+        videoFormat.format.video.height = 0;
+        videoFormat.format.video.minDimension = 0;
+        videoFormat.format.video.maxDimension = 0;
         params.formats.insert(videoFormat);
 
         // there is no clear way on how to use other formats- todo later
@@ -207,11 +212,8 @@ Video::Video(
     logMessage = videoPath;
     
 
-    // get a decoder
-    bool succeeded;
-
     cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
-          << "] has started";
+          << "] has started \n";
     
 
     
@@ -220,7 +222,6 @@ Video::Video(
     std::vector<double> audioTB, videoTB, ccTB, subsTB;
 
     // calback and metadata defined in struct
-    callback = nullptr;
     succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded) {
         for (const auto& header : metadata) {
@@ -320,6 +321,7 @@ int64_t Video::Next(std::string stream=""){
     if (res == 0){
         return 0;
     }
+    out.payload.reset();
     
     return 1;
 }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 74993382894..5aec33389af 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -29,6 +29,7 @@ using namespace ffmpeg;
 
 struct Video : torch::CustomClassHolder {
     bool any_frame=false; // add this to input parameters
+    bool succeeded=false; // this is decoder init stuff
     std::tuple<std::string, int64_t> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
@@ -47,7 +48,7 @@ struct Video : torch::CustomClassHolder {
         SyncDecoder decoder;
         DecoderParameters params;
 
-        DecoderInCallback callback;
+        DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
     // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
         // std::map<std::string, std::vector<StreamMetadata>> getMetadata();

From 8d26b22ca5c6dd4ba4761ae8fb4ea76eba53811a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 27 Aug 2020 04:32:44 -0500
Subject: [PATCH 060/128] Merge change

---
 torchvision/csrc/cpu/video/Video.cpp | 21 +++++++++++++++++----
 torchvision/csrc/cpu/video/Video.h   |  4 ----
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 0d9898ac5e5..4b376c4449d 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -184,9 +184,22 @@ void Video::_getDecoderParams(
                 break;
             }
         }
+
     }
 
-} // _get decoder params
+
+
+        // there is no clear way on how to use other formats- todo later
+        // MediaFormat subtitleFormat("0", (long) -2);
+        // subtitleFormat.type = TYPE_SUBTITLE;
+        // MediaFormat ccFormat((double) 0, (long) -2);
+        // ccFormat.type = TYPE_CC;
+
+}
+
+    // else use the stream using the correct parsing technique
+
+// } // _get decoder params
 
 
 Video::Video(
@@ -198,9 +211,9 @@ Video::Video(
 
     Video::_getDecoderParams(
         0,      // video start
-        false,  //headerOnly
-        get<0>(current_stream),
-        long(-2),     // stream_id parsed from info above
+        0,  //headerOnly
+        get<0>(current_stream), // stream
+        long(-1),     // stream_id parsed from info above
         true    // read all streams
     );
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 5aec33389af..ffaff017af7 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -25,8 +25,6 @@
 using namespace ffmpeg;
 
 
-
-
 struct Video : torch::CustomClassHolder {
     bool any_frame=false; // add this to input parameters
     bool succeeded=false; // this is decoder init stuff
@@ -56,8 +54,6 @@ struct Video : torch::CustomClassHolder {
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)
-        
-        
 
         // int64_t SecToStream(double ts); // TODO: add stream type
         // double StreamToSec(int64_t pts); // TODO: add stream type

From b801654c20d32637035f86c372b325c8a0040cda Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 28 Aug 2020 05:31:24 -0500
Subject: [PATCH 061/128] formatting parameters to avoid the segfault

---
 torchvision/csrc/cpu/video/Video.cpp | 115 +++++++++------------------
 torchvision/csrc/cpu/video/Video.h   |   2 -
 2 files changed, 38 insertions(+), 79 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 4b376c4449d..861d2863ab6 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -31,7 +31,6 @@ using namespace ffmpeg;
 
 
 const size_t decoderTimeoutMs = 600000;
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -113,93 +112,55 @@ void Video::_getDecoderParams(
         bool all_streams=false,
         double seekFrameMarginUs=10){
 
-    params.headerOnly = getPtsOnly != 0;
-    params.seekAccuracy = seekFrameMarginUs;
-    params.startOffset = videoStartUs;
-    params.endOffset = std::numeric_limits<long>::infinity();
+    
     params.timeoutMs = decoderTimeoutMs;
+    params.startOffset = videoStartUs;
+    params.seekAccuracy = 10;
+    params.headerOnly = false;
+
     params.preventStaleness = false;  // not sure what this is about
 
 
     if (all_streams == true){
-        MediaFormat audioFormat((long) -2);
-        audioFormat.type = TYPE_AUDIO;
-        audioFormat.format.audio.format = defaultAudioSampleFormat;
-        params.formats.insert(audioFormat);
-
-        MediaFormat videoFormat(0, (long) -2);
-        videoFormat.type = TYPE_VIDEO;
-        videoFormat.format.video.format = defaultVideoPixelFormat;
-        videoFormat.format.video.width = 0;
-        videoFormat.format.video.height = 0;
-        videoFormat.format.video.minDimension = 0;
-        videoFormat.format.video.maxDimension = 0;
-        params.formats.insert(videoFormat);
-
-        // there is no clear way on how to use other formats- todo later
-        MediaFormat subtitleFormat(char('0'), long(-2));
-        subtitleFormat.type = TYPE_SUBTITLE;
-        params.formats.insert(subtitleFormat);
-
-        MediaFormat ccFormat(double(0), long(-2));
-        ccFormat.type = TYPE_CC;
-        params.formats.insert(ccFormat);
-
+        MediaFormat format;
+        format.stream = -2;
+        format.type = TYPE_AUDIO;
+        params.formats.insert(format);
+
+        format.type = TYPE_VIDEO;
+        format.stream = -2;
+        format.format.video.width = 0;
+        format.format.video.height = 0;
+        format.format.video.cropImage = 0;
+        params.formats.insert(format);
+
+        format.type = TYPE_SUBTITLE;
+        format.stream = -2;
+        params.formats.insert(format);
+
+        format.type = TYPE_CC;
+        format.stream = -2;
+        params.formats.insert(format);
     } else{
+        // parse stream type
+        MediaType stream_type = parse_type_to_mt(stream);
+        
         // TODO: reset params.formats
         std::set<MediaFormat> formats;
         params.formats = formats;
-        MediaType stream_type = parse_type_to_mt(stream);
-        // now here is a mindfuck 
-        // - there is no way to construct mediaformat by type so we actually
-        // need an endless if/then
-        switch(stream_type) {
-            case TYPE_VIDEO:
-            {
-                MediaFormat videoFormat(0, (long) stream_id);
-                videoFormat.type = TYPE_VIDEO;
-                videoFormat.format.video.format = defaultVideoPixelFormat;
-                params.formats.insert(videoFormat);
-                break;
-            }
-            case TYPE_AUDIO:
-            {        
-                MediaFormat audioFormat((long) stream_id);
-                audioFormat.type = TYPE_AUDIO;
-                audioFormat.format.audio.format = defaultAudioSampleFormat;
-                params.formats.insert(audioFormat);
-                break;
-            }
-            // case TYPE_CC:
-            //     MediaFormat subtitleFormat(char('0'), long(stream_id));
-            //     subtitleFormat.type = TYPE_SUBTITLE;
-            //     params.formats.insert(subtitleFormat);
-            //     break;
-            default:
-            {
-                MediaFormat videoFormat(0, (long) -1);
-                videoFormat.type = TYPE_VIDEO;
-                videoFormat.format.video.format = defaultVideoPixelFormat;
-                params.formats.insert(videoFormat);
-                break;
-            }
+        // Define new format
+        MediaFormat format;
+        format.type = stream_type;
+        format.stream = stream_id;
+        if (stream_type == TYPE_VIDEO){
+            format.format.video.width = 0;
+            format.format.video.height = 0;
+            format.format.video.cropImage = 0;
         }
-
+        params.formats.insert(format);
     }
 
-
-
-        // there is no clear way on how to use other formats- todo later
-        // MediaFormat subtitleFormat("0", (long) -2);
-        // subtitleFormat.type = TYPE_SUBTITLE;
-        // MediaFormat ccFormat((double) 0, (long) -2);
-        // ccFormat.type = TYPE_CC;
-
-}
-
-    // else use the stream using the correct parsing technique
-
-// } // _get decoder params
+} // _get decoder params
 
 
 Video::Video(
@@ -207,6 +168,7 @@ Video::Video(
     std::string stream, 
     bool isReadFile) {
 
+
     //parse stream information
 
     Video::_getDecoderParams(
@@ -330,7 +292,6 @@ int64_t Video::Next(std::string stream=""){
 
     DecoderOutputMessage out;
     int64_t res = decoder.decode(&out, decoderTimeoutMs);
-
     if (res == 0){
         return 0;
     }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index ffaff017af7..9c9775609d8 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -48,8 +48,6 @@ struct Video : torch::CustomClassHolder {
 
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
-    // std::map<std::string, std::vector<std::map<std::string, double>>> getMetadata() const;
-        // std::map<std::string, std::vector<StreamMetadata>> getMetadata();
         
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:

From ec26d894bc7190481e5bc0605937670b4d2cec14 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 06:24:24 -0500
Subject: [PATCH 062/128] next now works on a video

---
 torchvision/csrc/cpu/video/Video.cpp | 76 +++++++++++++++++++++++-----
 torchvision/csrc/cpu/video/Video.h   | 10 +++-
 2 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 861d2863ab6..c4450f0965b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -38,6 +38,40 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 const size_t timeBaseJitterUs = 100;
 
 
+// returns number of written bytes
+template <typename T>
+size_t fillTensorList(DecoderOutputMessage& msgs,
+                      torch::Tensor& frame,
+                      torch::Tensor& framePts) {
+    // if (!msg) {
+    //     return 0;
+    // }
+    // set up PTS data
+    const auto& msg = msgs;
+
+    float* framePtsData = framePts.data_ptr<float>();
+    
+    float pts_s = float(float(msg.header.pts) * 1e-6);
+    framePtsData[0] =  pts_s;
+    
+    T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
+
+    
+    if (frameData) {
+        auto sizeInBytes = msg.payload->length();
+        memcpy(frameData, msg.payload->data(), sizeInBytes);
+    }
+  return sizeof(T);
+}
+
+size_t fillVideoTensor(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& videoFrame,
+    torch::Tensor& videoFramePts) {
+  return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
+}
+
+
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
@@ -195,6 +229,7 @@ Video::Video(
     std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
     std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
     std::vector<double> audioTB, videoTB, ccTB, subsTB;
+    
 
     // calback and metadata defined in struct
     succeeded = decoder.init(params, std::move(callback), &metadata);
@@ -210,21 +245,18 @@ Video::Video(
 
 
             if (header.format.type == TYPE_VIDEO) {
+                videoMetadata = header;
                 videoFPS.push_back(fps);
                 videoDuration.push_back(duration);
-                videoTB.push_back(timeBase);
             } else if (header.format.type == TYPE_AUDIO) {
                 audioFPS.push_back(fps);
                 audioDuration.push_back(duration);
-                audioTB.push_back(timeBase);
             } else if (header.format.type == TYPE_CC){
                 ccFPS.push_back(fps);
                 ccDuration.push_back(duration);
-                ccTB.push_back(timeBase);
             } else if (header.format.type == TYPE_SUBTITLE){
                 subsFPS.push_back(fps);
                 subsDuration.push_back(duration);
-                subsTB.push_back(timeBase);
             };
         }
 
@@ -232,7 +264,6 @@ Video::Video(
 
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
-    streamTimeBase.insert({{"video", videoTB}, {"audio", audioTB}});
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -280,23 +311,40 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
     
     bool succeeded = decoder.init(params, std::move(callback), &metadata);
     if (succeeded){
+        // initialize the class variables and retrurn
+        video_any_frame = any_frame;
+        seekTS = ts; 
         return 0;
     }
-
     return 1;
-
 }
 
+torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
-int64_t Video::Next(std::string stream=""){
+    size_t expectedWrittenBytes = 0;
+    torch::Tensor videoFramePts = torch::zeros({1}, torch::kFloat);
+
+    const auto& format = videoMetadata.format.format.video;
+    int outHeight = format.height;
+    int outWidth = format.width;
+    int numChannels = 3;
+    
+    torch::Tensor videoFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+    expectedWrittenBytes = outHeight * outWidth * numChannels;
+    std::cout << expectedWrittenBytes;
 
     DecoderOutputMessage out;
-    int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    if (res == 0){
-        return 0;
+    // if not in seek mode or only looking at the keyframes, 
+    // return the immediate next frame 
+    if ((seekTS == -1) || (video_any_frame == false)) {
+        int64_t res = decoder.decode(&out, decoderTimeoutMs);
+        auto numberWrittenBytes = fillVideoTensor(out, videoFrame, videoFramePts);
+        out.payload.reset();
     }
-    out.payload.reset();
-    
-    return 1;
+
+    torch::List<torch::Tensor> result;
+    result.push_back(videoFrame);
+    result.push_back(videoFramePts);
+    return result;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 9c9775609d8..4e953106507 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -26,18 +26,23 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    bool any_frame=false; // add this to input parameters
+    bool video_any_frame=false; // add this to input parameters
     bool succeeded=false; // this is decoder init stuff
+    // this acts as a flag - if it's not set, next function simply
+    // retruns the next frame. If it's set, we look at the global seek
+    // time in comination with any_frame settings
+    double seekTS=-1; 
     std::tuple<std::string, int64_t> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
+    DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         int64_t Seek(double ts, std::string stream, bool any_frame);
-        int64_t Next(std::string stream); //torch::List<torch::Tensor>
+        torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
         void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
@@ -49,6 +54,7 @@ struct Video : torch::CustomClassHolder {
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
         
+        
         // torch::List<torch::Tensor> Peak(std::string stream="")
     protected:
         // AV container type (check in decoder for exact type)

From 8f32236c61bde09413219f154e57a45b095d9159 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 13:02:58 -0500
Subject: [PATCH 063/128] make size of the output tensor format dependent

---
 torchvision/csrc/cpu/video/Video.cpp | 41 ++++++++++++++++------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index c4450f0965b..13e9ee3377b 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -209,7 +209,7 @@ Video::Video(
         0,      // video start
         0,  //headerOnly
         get<0>(current_stream), // stream
-        long(-1),     // stream_id parsed from info above
+        long(-1),     // stream_id parsed from info above change to -2
         true    // read all streams
     );
 
@@ -321,30 +321,37 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
 
 torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
-    size_t expectedWrittenBytes = 0;
-    torch::Tensor videoFramePts = torch::zeros({1}, torch::kFloat);
-
-    const auto& format = videoMetadata.format.format.video;
-    int outHeight = format.height;
-    int outWidth = format.width;
-    int numChannels = 3;
-    
-    torch::Tensor videoFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-    expectedWrittenBytes = outHeight * outWidth * numChannels;
-    std::cout << expectedWrittenBytes;
 
+    // first decode the frame
     DecoderOutputMessage out;
+    int64_t res = decoder.decode(&out, decoderTimeoutMs);
+    auto header = out.header;
+    const auto& format = header.format;
+
+    // then initialize the output variables based on type
+    size_t expectedWrittenBytes = 0;
+    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+
+    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
+    if (format.type == TYPE_VIDEO) {
+        int outHeight = format.format.video.height;
+        int outWidth = format.format.video.width;
+        int numChannels = 3;
+        outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+        expectedWrittenBytes = outHeight * outWidth * numChannels;
+        std::cout << expectedWrittenBytes;
+    }
+    
     // if not in seek mode or only looking at the keyframes, 
     // return the immediate next frame 
-    if ((seekTS == -1) || (video_any_frame == false)) {
-        int64_t res = decoder.decode(&out, decoderTimeoutMs);
-        auto numberWrittenBytes = fillVideoTensor(out, videoFrame, videoFramePts);
+    if ((seekTS == -1) || (video_any_frame == false)) {            
+        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
         out.payload.reset();
     }
 
     torch::List<torch::Tensor> result;
-    result.push_back(videoFrame);
-    result.push_back(videoFramePts);
+    result.push_back(outFrame);
+    result.push_back(framePTS);
     return result;
 }
 

From d0b3ee10d07874afa4278c49874c519f7b76616e Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Mon, 31 Aug 2020 13:52:57 -0500
Subject: [PATCH 064/128] Make next work on audio stream only as well

---
 torchvision/csrc/cpu/video/Video.cpp | 45 +++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 13e9ee3377b..3b525ecba67 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -71,6 +71,13 @@ size_t fillVideoTensor(
   return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
 }
 
+size_t fillAudioTensor(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& audioFrame,
+    torch::Tensor& audioFramePts) {
+  return fillTensorList<float>(msgs, audioFrame, audioFramePts);
+}
+
 
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
@@ -205,6 +212,8 @@ Video::Video(
 
     //parse stream information
 
+    current_stream = _parseStream(stream);
+    // note that in the initial call we want to get all streams
     Video::_getDecoderParams(
         0,      // video start
         0,  //headerOnly
@@ -220,11 +229,6 @@ Video::Video(
     logType = "file";
     logMessage = videoPath;
     
-
-    cout << "Video decoding to gather metadata from " << logType << " [" << logMessage
-          << "] has started \n";
-    
-
     
     std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
     std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
@@ -243,7 +247,6 @@ Video::Video(
             cout << "Decoding stream of" << header.format.type;
             cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
 
-
             if (header.format.type == TYPE_VIDEO) {
                 videoMetadata = header;
                 videoFPS.push_back(fps);
@@ -261,9 +264,20 @@ Video::Video(
         }
 
     }
-
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+
+
+    // set current stream again
+    Video::_getDecoderParams(
+        0,      // video start
+        0,  //headerOnly
+        get<0>(current_stream), // stream
+        long(-1),     // stream_id parsed from info above change to -2
+        false    // read all streams
+    );
+    // calback and metadata defined in Video.h
+    succeeded = decoder.init(params, std::move(callback), &metadata);
 } //video
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
@@ -340,12 +354,27 @@ torch::List<torch::Tensor> Video::Next(std::string stream=""){
         outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
         expectedWrittenBytes = outHeight * outWidth * numChannels;
         std::cout << expectedWrittenBytes;
+    } else if (format.type == TYPE_AUDIO) {
+        int outAudioChannels = format.format.audio.channels;
+        int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
+        int frameSizeTotal = out.payload->length();
+        
+        CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+        int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+        outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+        expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
     
     // if not in seek mode or only looking at the keyframes, 
     // return the immediate next frame 
     if ((seekTS == -1) || (video_any_frame == false)) {            
-        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+        if (format.type == TYPE_VIDEO) {
+            auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+        } else {
+            auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+        }
         out.payload.reset();
     }
 

From be00ba7447b067d99b0e1c6869bb9fc0442cda7a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 2 Sep 2020 05:56:09 -0500
Subject: [PATCH 065/128] refactoring the _setCurrentStream param

---
 torchvision/csrc/cpu/video/Video.cpp | 67 +++++++++++++++-------------
 torchvision/csrc/cpu/video/Video.h   |  3 +-
 2 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 3b525ecba67..8fa5361c675 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -145,7 +145,7 @@ std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
 
 
 void Video::_getDecoderParams(
-        int64_t videoStartUs,
+        int64_t videoStartS,
         int64_t getPtsOnly,
 
         std::string stream,
@@ -154,6 +154,8 @@ void Video::_getDecoderParams(
         double seekFrameMarginUs=10){
 
     
+    int64_t videoStartUs = int64_t(videoStartS * 1e6);
+
     params.timeoutMs = decoderTimeoutMs;
     params.startOffset = videoStartUs;
     params.seekAccuracy = 10;
@@ -217,7 +219,7 @@ Video::Video(
     Video::_getDecoderParams(
         0,      // video start
         0,  //headerOnly
-        get<0>(current_stream), // stream
+        get<0>(current_stream), // stream info - remove that
         long(-1),     // stream_id parsed from info above change to -2
         true    // read all streams
     );
@@ -243,10 +245,6 @@ Video::Video(
             double timeBase = double(header.num) / double(header.den);
             double duration = double(header.duration) * 1e-6; // * timeBase;
 
-
-            cout << "Decoding stream of" << header.format.type;
-            cout << "duration " << duration << " tb" << timeBase << " " << double(header.num) << " " <<double(header.num);
-
             if (header.format.type == TYPE_VIDEO) {
                 videoMetadata = header;
                 videoFPS.push_back(fps);
@@ -268,17 +266,40 @@ Video::Video(
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
 
 
-    // set current stream again
-    Video::_getDecoderParams(
-        0,      // video start
+    // // set current stream again
+    // Video::_getDecoderParams(
+    //     0,      // video start
+    //     0,  //headerOnly
+    //     get<0>(current_stream), // stream
+    //     long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
+    //     false    // read all streams
+    // );
+
+    // succeeded = decoder.init(params, std::move(callback), &metadata);
+    succeeded = Video::_setCurrentStream(stream);
+    std::cout << "\nDecoder inited with: " << succeeded;
+} //video
+
+// why is this not woriking? 
+bool Video::_setCurrentStream(std::string stream){  
+    current_stream = _parseStream(stream);
+    double ts = 0;
+    if (seekTS > 0) {
+        ts = seekTS;
+    }
+
+    _getDecoderParams(
+        ts,  // video start
         0,  //headerOnly
         get<0>(current_stream), // stream
-        long(-1),     // stream_id parsed from info above change to -2
+        long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
         false    // read all streams
     );
+
     // calback and metadata defined in Video.h
-    succeeded = decoder.init(params, std::move(callback), &metadata);
-} //video
+    return(decoder.init(params, std::move(callback), &metadata));
+
+}
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
     return current_stream;
@@ -307,27 +328,13 @@ std::vector<double> Video::getDuration(std::string stream) const{
 }
 
 int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    // check if the stream exists
 
-    // convert time to microseconds and cast to unsigned long int
-    int64_t ts_out = int64_t(ts * 1e6);
+    // initialize the class variables and retrurn
+    video_any_frame = any_frame;
+    seekTS = ts; 
 
-    Video::_getDecoderParams(
-        ts_out,
-        0, // we're in full get frame mode
-        get<0>(stream_tpl),
-        get<1>(stream_tpl),
-        false);
-    
-    bool succeeded = decoder.init(params, std::move(callback), &metadata);
+    succeeded = Video::_setCurrentStream(stream);
     if (succeeded){
-        // initialize the class variables and retrurn
-        video_any_frame = any_frame;
-        seekTS = ts; 
         return 0;
     }
     return 1;
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 4e953106507..a03bd88d6cd 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -45,7 +45,8 @@ struct Video : torch::CustomClassHolder {
         torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
-        void _getDecoderParams(int64_t videoStartUs, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        bool _setCurrentStream(std::string stream="video");
         std::map<std::string, std::vector<double>> streamTimeBase;
 
         SyncDecoder decoder;

From e79b4fb3fe186ecb255ae4396f3a8e161c52e42c Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 3 Sep 2020 05:36:46 -0500
Subject: [PATCH 066/128] Fixing the last frame return and sensor

---
 torchvision/csrc/cpu/video/Video.cpp | 107 ++++++++++++++++-----------
 video_reader.todo                    |  30 ++++----
 2 files changed, 77 insertions(+), 60 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 8fa5361c675..ced2ff1f99c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -119,7 +119,7 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
       "Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
+std::tuple<std::string, long> _parseStream(const std::string& streamString){
     TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
     static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
     std::smatch match;
@@ -130,7 +130,7 @@ std::tuple<std::string, int64_t> _parseStream(const std::string& streamString){
     
     std::string type_ = "video";
     type_ = parse_type_to_string(match[1].str());
-    int64_t index_ = -1;
+    long index_ = -1;
     if (match[2].matched) {
         try {
         index_ = c10::stoi(match[2].str());
@@ -265,19 +265,12 @@ Video::Video(
     streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
     streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
 
-
-    // // set current stream again
-    // Video::_getDecoderParams(
-    //     0,      // video start
-    //     0,  //headerOnly
-    //     get<0>(current_stream), // stream
-    //     long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
-    //     false    // read all streams
-    // );
-
-    // succeeded = decoder.init(params, std::move(callback), &metadata);
     succeeded = Video::_setCurrentStream(stream);
-    std::cout << "\nDecoder inited with: " << succeeded;
+    LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
+    if (long(get<1>(current_stream)) != -1) {
+        LOG(INFO) << "Stream index set to " << long(get<1>(current_stream) <<
+        ". If you encounter trouble, consider switching it to automatic stream discovery.\n";
+    }
 } //video
 
 // why is this not woriking? 
@@ -342,47 +335,71 @@ int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
 
 torch::List<torch::Tensor> Video::Next(std::string stream=""){
 
+    bool switched = false;
+    if ((!stream.empty()) && (_parseStream(stream) != current_stream)){
+        succeeded = Video::_setCurrentStream(stream);
+        if (succeeded){
+            cout << "Switching the stream to new one in next ya'll \n";
+            switched = true;
+        }
+    }
+
+    // if failing to decode simply return 0 (note, maybe 
+    // raise an exeption otherwise)
+    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
     // first decode the frame
     DecoderOutputMessage out;
     int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    auto header = out.header;
-    const auto& format = header.format;
+    if (res == 0){
 
-    // then initialize the output variables based on type
-    size_t expectedWrittenBytes = 0;
-    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+        auto header = out.header;
+        const auto& format = header.format;
 
-    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-    if (format.type == TYPE_VIDEO) {
-        int outHeight = format.format.video.height;
-        int outWidth = format.format.video.width;
-        int numChannels = 3;
-        outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-        expectedWrittenBytes = outHeight * outWidth * numChannels;
-        std::cout << expectedWrittenBytes;
-    } else if (format.type == TYPE_AUDIO) {
-        int outAudioChannels = format.format.audio.channels;
-        int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
-        int frameSizeTotal = out.payload->length();
-        
-        CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-        int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+        if (switched == true) {
+            cout << "now looking at " << format.type <<" \n";
+        }
 
-        outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+        // then initialize the output variables based on type
+        size_t expectedWrittenBytes = 0;
 
-        expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
-    }
-    
-    // if not in seek mode or only looking at the keyframes, 
-    // return the immediate next frame 
-    if ((seekTS == -1) || (video_any_frame == false)) {            
         if (format.type == TYPE_VIDEO) {
-            auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-        } else {
-            auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+            int outHeight = format.format.video.height;
+            int outWidth = format.format.video.width;
+            int numChannels = 3;
+            outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+            expectedWrittenBytes = outHeight * outWidth * numChannels;
+            std::cout << expectedWrittenBytes;
+        } else if (format.type == TYPE_AUDIO) {
+            int outAudioChannels = format.format.audio.channels;
+            int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
+            int frameSizeTotal = out.payload->length();
+            
+            CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+            int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+            outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+            expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
+        }
+        
+        std::cout << "Successfully allocated tensors to the dimension \n" ;
+        // if not in seek mode or only looking at the keyframes, 
+        // return the immediate next frame 
+        if ((seekTS == -1) || (video_any_frame == false)) {   
+
+            std::cout << "In non-seek mode stuff is happening \n";         
+            if (format.type == TYPE_VIDEO) {
+                auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+            } else {
+                auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+            }
+            out.payload.reset();
         }
-        out.payload.reset();
+    }
+    else {
+        LOG(ERROR) << "Decoder run into a last iteration or has failed";
     }
 
     torch::List<torch::Tensor> result;
diff --git a/video_reader.todo b/video_reader.todo
index 2a01bbde4b8..f51dab1c8ef 100644
--- a/video_reader.todo
+++ b/video_reader.todo
@@ -1,15 +1,15 @@
-The new API:
-    ☐ the c++ extension is going to live in torchvision/csrc/cpu/video
-    ☐ modification of the build needs to go to setup.py
-    ☐ torchvision/io/_video something needs to happen somehow
-
-Tests changes:
-    ☐ test/test_io.py
-    ☐ test/test_video_reader.py (change to test video api)
-
-
-
-Implementation:
-    ☐ Datatype for strem
-    ☐ Datatype for container
-    ☐ Do I use tensor as a type in metadata 
\ No newline at end of file
+Documented edgecases that don't work:
+    ☐ seeking with anyframe=True
+    ✔ last frame segfaults - exit cleanly @started(20-09-02 10:44) @done(20-09-02 11:07) @lasted(23m1s)
+    ✔ switching modalities in the subsequent calls to next() @done(20-09-03 05:33)
+    ```
+    video.next("video:0")
+    video.next("video:0")
+    video.next("audio:0")
+    ```
+
+
+Random todo's:
+    ✔ add check for the current stream @done(20-09-02 06:37)
+    ☐ ensure warning if stream is out of bounds
+    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
\ No newline at end of file

From 8f031ad854088d1c9c97d6147707eed63b31a9d0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 3 Sep 2020 06:01:55 -0500
Subject: [PATCH 067/128] todo docs

---
 video_reader.todo | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/video_reader.todo b/video_reader.todo
index f51dab1c8ef..cc9bc343f23 100644
--- a/video_reader.todo
+++ b/video_reader.todo
@@ -12,4 +12,8 @@ Documented edgecases that don't work:
 Random todo's:
     ✔ add check for the current stream @done(20-09-02 06:37)
     ☐ ensure warning if stream is out of bounds
-    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
\ No newline at end of file
+    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
+    ☐ can we make this an iterable?
+    ☐ destructors
+    ☐ adding tests to test.py
+    ☐ thorough checking for memory leaks
\ No newline at end of file

From 7d0e5f66b628f0c0b7bbe94c43f4494e755d00e7 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 8 Sep 2020 06:12:43 -0500
Subject: [PATCH 068/128] Formatting

---
 torchvision/csrc/cpu/video/Video.cpp | 572 +++++++++++++--------------
 torchvision/csrc/cpu/video/Video.h   |  12 +-
 2 files changed, 285 insertions(+), 299 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index ced2ff1f99c..d711325a498 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -1,17 +1,14 @@
 
-# include "Video.h"
-#include <torch/script.h>
+#include "Video.h"
 #include <c10/util/Logging.h>
-#include "sync_decoder.h"
-#include "sync_decoder.h"
-#include "memory_buffer.h"
+#include <torch/script.h>
 #include "defs.h"
-
+#include "memory_buffer.h"
+#include "sync_decoder.h"
 
 using namespace std;
 using namespace ffmpeg;
 
-
 // If we are in a Windows environment, we need to define
 // initialization functions for the _custom_ops extension
 // #ifdef _WIN32
@@ -28,8 +25,6 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
-
-
 const size_t decoderTimeoutMs = 600000;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
@@ -37,30 +32,26 @@ const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // to compensate rounding error due to the multiple conversions.
 const size_t timeBaseJitterUs = 100;
 
-
 // returns number of written bytes
 template <typename T>
-size_t fillTensorList(DecoderOutputMessage& msgs,
-                      torch::Tensor& frame,
-                      torch::Tensor& framePts) {
-    // if (!msg) {
-    //     return 0;
-    // }
-    // set up PTS data
-    const auto& msg = msgs;
-
-    float* framePtsData = framePts.data_ptr<float>();
-    
-    float pts_s = float(float(msg.header.pts) * 1e-6);
-    framePtsData[0] =  pts_s;
-    
-    T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-
-    
-    if (frameData) {
-        auto sizeInBytes = msg.payload->length();
-        memcpy(frameData, msg.payload->data(), sizeInBytes);
-    }
+size_t fillTensorList(
+    DecoderOutputMessage& msgs,
+    torch::Tensor& frame,
+    torch::Tensor& framePts) {
+  // set up PTS data
+  const auto& msg = msgs;
+
+  float* framePtsData = framePts.data_ptr<float>();
+
+  float pts_s = float(float(msg.header.pts) * 1e-6);
+  framePtsData[0] = pts_s;
+
+  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
+
+  if (frameData) {
+    auto sizeInBytes = msg.payload->length();
+    memcpy(frameData, msg.payload->data(), sizeInBytes);
+  }
   return sizeof(T);
 }
 
@@ -78,7 +69,6 @@ size_t fillAudioTensor(
   return fillTensorList<float>(msgs, audioFrame, audioFramePts);
 }
 
-
 std::string parse_type_to_string(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
@@ -95,8 +85,7 @@ std::string parse_type_to_string(const std::string& stream_string) {
   if (device != types.end()) {
     return device->first;
   }
-  AT_ERROR(
-      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
 MediaType parse_type_to_mt(const std::string& stream_string) {
@@ -115,296 +104,293 @@ MediaType parse_type_to_mt(const std::string& stream_string) {
   if (device != types.end()) {
     return device->second;
   }
-  AT_ERROR(
-      "Expected one of [audio, video, subtitle, cc] ", stream_string);
+  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::tuple<std::string, long> _parseStream(const std::string& streamString){
-    TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
-    static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
-    std::smatch match;
-
-    TORCH_CHECK(
-        std::regex_match(streamString, match, regex),
-        "Invalid stream string: '", streamString, "'");
-    
-    std::string type_ = "video";
-    type_ = parse_type_to_string(match[1].str());
-    long index_ = -1;
-    if (match[2].matched) {
-        try {
-        index_ = c10::stoi(match[2].str());
-        } catch (const std::exception &) {
-        AT_ERROR(
-            "Could not parse device index '", match[2].str(),
-            "' in device string '", streamString, "'");
-        }
+std::tuple<std::string, long> _parseStream(const std::string& streamString) {
+  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
+  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
+  std::smatch match;
+
+  TORCH_CHECK(
+      std::regex_match(streamString, match, regex),
+      "Invalid stream string: '",
+      streamString,
+      "'");
+
+  std::string type_ = "video";
+  type_ = parse_type_to_string(match[1].str());
+  long index_ = -1;
+  if (match[2].matched) {
+    try {
+      index_ = c10::stoi(match[2].str());
+    } catch (const std::exception&) {
+      AT_ERROR(
+          "Could not parse device index '",
+          match[2].str(),
+          "' in device string '",
+          streamString,
+          "'");
     }
-    return std::make_tuple(type_, index_);
+  }
+  return std::make_tuple(type_, index_);
 }
 
-
 void Video::_getDecoderParams(
-        int64_t videoStartS,
-        int64_t getPtsOnly,
-
-        std::string stream,
-        long stream_id=-1,
-        bool all_streams=false,
-        double seekFrameMarginUs=10){
-
-    
-    int64_t videoStartUs = int64_t(videoStartS * 1e6);
-
-    params.timeoutMs = decoderTimeoutMs;
-    params.startOffset = videoStartUs;
-    params.seekAccuracy = 10;
-    params.headerOnly = false;
-
-    params.preventStaleness = false;  // not sure what this is about
-
-
-    if (all_streams == true){
-        MediaFormat format;
-        format.stream = -2;
-        format.type = TYPE_AUDIO;
-        params.formats.insert(format);
-
-        format.type = TYPE_VIDEO;
-        format.stream = -2;
-        format.format.video.width = 0;
-        format.format.video.height = 0;
-        format.format.video.cropImage = 0;
-        params.formats.insert(format);
-
-        format.type = TYPE_SUBTITLE;
-        format.stream = -2;
-        params.formats.insert(format);
-
-        format.type = TYPE_CC;
-        format.stream = -2;
-        params.formats.insert(format);
-    } else{
-        // parse stream type
-        MediaType stream_type = parse_type_to_mt(stream);
-        
-        // TODO: reset params.formats
-        std::set<MediaFormat> formats;
-        params.formats = formats;
-        // Define new format
-        MediaFormat format;
-        format.type = stream_type;
-        format.stream = stream_id;
-        if (stream_type == TYPE_VIDEO){
-            format.format.video.width = 0;
-            format.format.video.height = 0;
-            format.format.video.cropImage = 0;
-        }
-        params.formats.insert(format);
+
+    int64_t videoStartS,
+    int64_t getPtsOnly,
+    std::string stream,
+    long stream_id = -1,
+    bool all_streams = false,
+    double seekFrameMarginUs = 10) {
+  int64_t videoStartUs = int64_t(videoStartS * 1e6);
+
+  params.timeoutMs = decoderTimeoutMs;
+  params.startOffset = videoStartUs;
+  params.seekAccuracy = 10;
+  params.headerOnly = false;
+
+  params.preventStaleness = false; // not sure what this is about
+
+  if (all_streams == true) {
+    MediaFormat format;
+    format.stream = -2;
+    format.type = TYPE_AUDIO;
+    params.formats.insert(format);
+
+    format.type = TYPE_VIDEO;
+    format.stream = -2;
+    format.format.video.width = 0;
+    format.format.video.height = 0;
+    format.format.video.cropImage = 0;
+    params.formats.insert(format);
+
+    format.type = TYPE_SUBTITLE;
+    format.stream = -2;
+    params.formats.insert(format);
+
+    format.type = TYPE_CC;
+    format.stream = -2;
+    params.formats.insert(format);
+  } else {
+    // parse stream type
+    MediaType stream_type = parse_type_to_mt(stream);
+
+    // TODO: reset params.formats
+    std::set<MediaFormat> formats;
+    params.formats = formats;
+    // Define new format
+    MediaFormat format;
+    format.type = stream_type;
+    format.stream = stream_id;
+    if (stream_type == TYPE_VIDEO) {
+      format.format.video.width = 0;
+      format.format.video.height = 0;
+      format.format.video.cropImage = 0;
     }
+    params.formats.insert(format);
+  }
 
 } // _get decoder params
 
 
-Video::Video(
-    std::string videoPath, 
-    std::string stream, 
-    bool isReadFile) {
-
-
-    //parse stream information
-
-    current_stream = _parseStream(stream);
-    // note that in the initial call we want to get all streams
-    Video::_getDecoderParams(
-        0,      // video start
-        0,  //headerOnly
-        get<0>(current_stream), // stream info - remove that
-        long(-1),     // stream_id parsed from info above change to -2
-        true    // read all streams
-    );
-
-    std::string logMessage, logType;
-    
-    // TODO: add read from memory option
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-    
-    
-    std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
-    std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
-    std::vector<double> audioTB, videoTB, ccTB, subsTB;
-    
-
-    // calback and metadata defined in struct
-    succeeded = decoder.init(params, std::move(callback), &metadata);
-    if (succeeded) {
-        for (const auto& header : metadata) {
-            double fps = double(header.fps);
-            double timeBase = double(header.num) / double(header.den);
-            double duration = double(header.duration) * 1e-6; // * timeBase;
-
-            if (header.format.type == TYPE_VIDEO) {
-                videoMetadata = header;
-                videoFPS.push_back(fps);
-                videoDuration.push_back(duration);
-            } else if (header.format.type == TYPE_AUDIO) {
-                audioFPS.push_back(fps);
-                audioDuration.push_back(duration);
-            } else if (header.format.type == TYPE_CC){
-                ccFPS.push_back(fps);
-                ccDuration.push_back(duration);
-            } else if (header.format.type == TYPE_SUBTITLE){
-                subsFPS.push_back(fps);
-                subsDuration.push_back(duration);
-            };
-        }
-
+Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
+  // parse stream information
+  current_stream = _parseStream(stream);
+  // note that in the initial call we want to get all streams
+  Video::_getDecoderParams(
+      0, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream info - remove that
+      long(-1), // stream_id parsed from info above change to -2
+      true // read all streams
+  );
+
+  std::string logMessage, logType;
+
+  // TODO: add read from memory option
+  params.uri = videoPath;
+  logType = "file";
+  logMessage = videoPath;
+
+  std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
+  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
+  std::vector<double> audioTB, videoTB, ccTB, subsTB;
+
+  // calback and metadata defined in struct
+  succeeded = decoder.init(params, std::move(callback), &metadata);
+  if (succeeded) {
+    for (const auto& header : metadata) {
+      double fps = double(header.fps);
+      double timeBase = double(header.num) / double(header.den);
+      double duration = double(header.duration) * 1e-6; // * timeBase;
+
+      if (header.format.type == TYPE_VIDEO) {
+        videoMetadata = header;
+        videoFPS.push_back(fps);
+        videoDuration.push_back(duration);
+      } else if (header.format.type == TYPE_AUDIO) {
+        audioFPS.push_back(fps);
+        audioDuration.push_back(duration);
+      } else if (header.format.type == TYPE_CC) {
+        ccFPS.push_back(fps);
+        ccDuration.push_back(duration);
+      } else if (header.format.type == TYPE_SUBTITLE) {
+        subsFPS.push_back(fps);
+        subsDuration.push_back(duration);
+      };
     }
-    streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
-    streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
-
-    succeeded = Video::_setCurrentStream(stream);
-    LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
-    if (long(get<1>(current_stream)) != -1) {
-        LOG(INFO) << "Stream index set to " << long(get<1>(current_stream) <<
-        ". If you encounter trouble, consider switching it to automatic stream discovery.\n";
-    }
-} //video
-
-// why is this not woriking? 
-bool Video::_setCurrentStream(std::string stream){  
-    current_stream = _parseStream(stream);
-    double ts = 0;
-    if (seekTS > 0) {
-        ts = seekTS;
-    }
-
-    _getDecoderParams(
-        ts,  // video start
-        0,  //headerOnly
-        get<0>(current_stream), // stream
-        long(get<1>(current_stream)),     // stream_id parsed from info above change to -2
-        false    // read all streams
-    );
+  }
+  streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
+  streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+
+  succeeded = Video::_setCurrentStream();
+  LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
+  if (get<1>(current_stream) != -1) {
+    LOG(INFO)
+        << "Stream index set to " << get<1>(current_stream)
+        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
+  }
+} // video
 
-    // calback and metadata defined in Video.h
-    return(decoder.init(params, std::move(callback), &metadata));
+bool Video::_setCurrentStream() {
+  double ts = 0;
+  if (seekTS > 0) {
+    ts = seekTS;
+  }
 
+  _getDecoderParams(
+      ts, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream
+      long(get<1>(
+          current_stream)), // stream_id parsed from info above change to -2
+      false // read all streams
+  );
+
+  // calback and metadata defined in Video.h
+  return (decoder.init(params, std::move(callback), &metadata));
 }
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
-    return current_stream;
+  return current_stream;
 }
 
-std::vector<double> Video::getFPS(std::string stream) const{
-    // add safety check
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    std::string stream_str = get<0>(stream_tpl);
-    // check if the stream exists
-    return streamFPS.at(stream_str);
+std::vector<double> Video::getFPS(std::string stream) const {
+  // add safety check
+  if (stream.empty()) {
+    stream = get<0>(current_stream);
+  }
+  auto stream_tpl = _parseStream(stream);
+  std::string stream_str = get<0>(stream_tpl);
+  // check if the stream exists
+  return streamFPS.at(stream_str);
 }
 
-std::vector<double> Video::getDuration(std::string stream) const{
-    // add safety check
-    if (stream.empty()){
-        stream = get<0>(current_stream);
-    }
-    auto stream_tpl = _parseStream(stream);
-    std::string stream_str = get<0>(stream_tpl);
-    // check if the stream exists
-    return streamDuration.at(stream_str);
+std::vector<double> Video::getDuration(std::string stream) const {
+  // add safety check
+  if (stream.empty()) {
+    stream = get<0>(current_stream);
+  }
+  auto stream_tpl = _parseStream(stream);
+  std::string stream_str = get<0>(stream_tpl);
+  // check if the stream exists
+  return streamDuration.at(stream_str);
 }
 
-int64_t Video::Seek(double ts, std::string stream="", bool any_frame=false){
+void Video::Seek(double ts, bool any_frame = false) {
+  // initialize the class variables used for seeking and retrurn
+  video_any_frame = any_frame;
+  seekTS = ts;
+  doSeek = true;
+}
 
-    // initialize the class variables and retrurn
-    video_any_frame = any_frame;
-    seekTS = ts; 
+torch::List<torch::Tensor> Video::Next(std::string stream) {
 
-    succeeded = Video::_setCurrentStream(stream);
-    if (succeeded){
-        return 0;
-    }
-    return 1;
-}
+  bool newInit = false;
+  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
+      current_stream = _parseStream(stream);
+      newInit = true;
+  }
 
-torch::List<torch::Tensor> Video::Next(std::string stream=""){
+  if ((seekTS != -1) && (doSeek == true)) {
+      newInit = true;
+      doSeek = false;
+  }
 
-    bool switched = false;
-    if ((!stream.empty()) && (_parseStream(stream) != current_stream)){
-        succeeded = Video::_setCurrentStream(stream);
-        if (succeeded){
-            cout << "Switching the stream to new one in next ya'll \n";
-            switched = true;
-        }
+  if (newInit){
+    succeeded = Video::_setCurrentStream();
+    if (succeeded) {
+      newInit = false;
+      // cout << "Reinitializing the decoder again \n";
     }
+  }
 
-    // if failing to decode simply return 0 (note, maybe 
-    // raise an exeption otherwise)
-    torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
-    torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-
-    // first decode the frame
-    DecoderOutputMessage out;
-    int64_t res = decoder.decode(&out, decoderTimeoutMs);
-    if (res == 0){
-
-        auto header = out.header;
-        const auto& format = header.format;
-
-        if (switched == true) {
-            cout << "now looking at " << format.type <<" \n";
-        }
-
-        // then initialize the output variables based on type
-        size_t expectedWrittenBytes = 0;
-
-        if (format.type == TYPE_VIDEO) {
-            int outHeight = format.format.video.height;
-            int outWidth = format.format.video.width;
-            int numChannels = 3;
-            outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-            expectedWrittenBytes = outHeight * outWidth * numChannels;
-            std::cout << expectedWrittenBytes;
-        } else if (format.type == TYPE_AUDIO) {
-            int outAudioChannels = format.format.audio.channels;
-            int bytesPerSample = av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format.audio.format));
-            int frameSizeTotal = out.payload->length();
-            
-            CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-            int numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-            outFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
-            expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
-        }
-        
-        std::cout << "Successfully allocated tensors to the dimension \n" ;
-        // if not in seek mode or only looking at the keyframes, 
-        // return the immediate next frame 
-        if ((seekTS == -1) || (video_any_frame == false)) {   
-
-            std::cout << "In non-seek mode stuff is happening \n";         
-            if (format.type == TYPE_VIDEO) {
-                auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-            } else {
-                auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-            }
-            out.payload.reset();
-        }
+  // if failing to decode simply return 0 (note, maybe
+  // raise an exeption otherwise)
+  torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
+
+  // first decode the frame
+  DecoderOutputMessage out;
+  int64_t res = decoder.decode(&out, decoderTimeoutMs);
+  if (res == 0) {
+    auto header = out.header;
+    const auto& format = header.format;
+
+    // then initialize the output variables based on type
+    size_t expectedWrittenBytes = 0;
+
+    if (format.type == TYPE_VIDEO) {
+      int outHeight = format.format.video.height;
+      int outWidth = format.format.video.width;
+      int numChannels = 3;
+      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+      expectedWrittenBytes = outHeight * outWidth * numChannels;
+      // std::cout << expectedWrittenBytes;
+    } else if (format.type == TYPE_AUDIO) {
+      int outAudioChannels = format.format.audio.channels;
+      int bytesPerSample = av_get_bytes_per_sample(
+          static_cast<AVSampleFormat>(format.format.audio.format));
+      int frameSizeTotal = out.payload->length();
+
+      CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
+      int numAudioSamples =
+          frameSizeTotal / (outAudioChannels * bytesPerSample);
+
+      outFrame =
+          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
+
+      expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
-    else {
-        LOG(ERROR) << "Decoder run into a last iteration or has failed";
+
+    // std::cout << "Successfully allocated tensors to the dimension \n";
+    // if not in seek mode or only looking at the keyframes,
+    // return the immediate next frame
+    if ((seekTS == -1) || (video_any_frame == false)) {
+      // std::cout << "In non-seek mode stuff is happening \n";
+      if (format.type == TYPE_VIDEO) {
+        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      } else {
+        auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
+      }
+      out.payload.reset();
     }
+  } else {
+    LOG(ERROR) << "Decoder run into a last iteration or has failed";
+  }
 
-    torch::List<torch::Tensor> result;
-    result.push_back(outFrame);
-    result.push_back(framePTS);
-    return result;
+  torch::List<torch::Tensor> result;
+  result.push_back(outFrame);
+  result.push_back(framePTS);
+  return result;
 }
 
+Video::~Video() {
+//   delete params; // does not have destructor
+//   delete metadata; // struct does not have destructor
+//   delete decoder; // should be fine
+//   delete streamFPS; // should be fine
+//   delete streamDuration; // should be fine
+}
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index a03bd88d6cd..6d96e3262cd 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -28,25 +28,27 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
     bool video_any_frame=false; // add this to input parameters
     bool succeeded=false; // this is decoder init stuff
-    // this acts as a flag - if it's not set, next function simply
+    // seekTS acts as a flag - if it's not set, next function simply
     // retruns the next frame. If it's set, we look at the global seek
     // time in comination with any_frame settings
     double seekTS=-1; 
-    std::tuple<std::string, int64_t> current_stream;
+    bool doSeek=false;
+    std::tuple<std::string, long> current_stream;
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
     DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
+        ~Video();
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
-        int64_t Seek(double ts, std::string stream, bool any_frame);
+        void Seek(double ts, bool any_frame);
         torch::List<torch::Tensor> Next(std::string stream); //
 
     private:
         void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-        bool _setCurrentStream(std::string stream="video");
+        bool _setCurrentStream();
         std::map<std::string, std::vector<double>> streamTimeBase;
 
         SyncDecoder decoder;
@@ -60,8 +62,6 @@ struct Video : torch::CustomClassHolder {
     protected:
         // AV container type (check in decoder for exact type)
 
-        // int64_t SecToStream(double ts); // TODO: add stream type
-        // double StreamToSec(int64_t pts); // TODO: add stream type
     
 }; // struct Video
 

From 7c56040b3918ffde03db9b3dfc60bef1455eef56 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 16 Sep 2020 07:19:01 -0500
Subject: [PATCH 069/128] cleanup and comments

---
 torchvision/csrc/cpu/video/Video.cpp | 46 +++++++++++++---------------
 torchvision/csrc/cpu/video/Video.h   | 38 +++++++++++------------
 2 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d711325a498..f31b1614167 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -227,7 +227,6 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       double duration = double(header.duration) * 1e-6; // * timeBase;
 
       if (header.format.type == TYPE_VIDEO) {
-        videoMetadata = header;
         videoFPS.push_back(fps);
         videoDuration.push_back(duration);
       } else if (header.format.type == TYPE_AUDIO) {
@@ -307,8 +306,8 @@ void Video::Seek(double ts, bool any_frame = false) {
 }
 
 torch::List<torch::Tensor> Video::Next(std::string stream) {
-
-  bool newInit = false;
+  
+  bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
       current_stream = _parseStream(stream);
       newInit = true;
@@ -323,32 +322,34 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     succeeded = Video::_setCurrentStream();
     if (succeeded) {
       newInit = false;
-      // cout << "Reinitializing the decoder again \n";
     }
   }
 
-  // if failing to decode simply return 0 (note, maybe
-  // raise an exeption otherwise)
+  // if failing to decode simply return a null tensor (note, should we
+  // raise an exeption?)
   torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
   torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
-  // first decode the frame
+  // decode single frame
   DecoderOutputMessage out;
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
+  // if successfull
   if (res == 0) {
     auto header = out.header;
     const auto& format = header.format;
 
-    // then initialize the output variables based on type
+    // initialize the output variables based on type
     size_t expectedWrittenBytes = 0;
 
     if (format.type == TYPE_VIDEO) {
+      // note: this can potentially be optimized
+      // by having the global tensor that we fill at decode time
+      // (would avoid allocations)
       int outHeight = format.format.video.height;
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
       expectedWrittenBytes = outHeight * outWidth * numChannels;
-      // std::cout << expectedWrittenBytes;
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -364,21 +365,17 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
       expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
+    // currently not supporting other formats (will do soon)
 
-    // std::cout << "Successfully allocated tensors to the dimension \n";
-    // if not in seek mode or only looking at the keyframes,
-    // return the immediate next frame
-    if ((seekTS == -1) || (video_any_frame == false)) {
-      // std::cout << "In non-seek mode stuff is happening \n";
-      if (format.type == TYPE_VIDEO) {
-        auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-      } else {
-        auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-      }
-      out.payload.reset();
+    // note: this will need to be revised to support less-accurate seek. So far keep as is
+    if (format.type == TYPE_VIDEO) {
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+    } else {
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
     }
-  } else {
-    LOG(ERROR) << "Decoder run into a last iteration or has failed";
+    out.payload.reset();
+  } else{
+    LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
   torch::List<torch::Tensor> result;
@@ -387,10 +384,11 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
   return result;
 }
 
-Video::~Video() {
+// Video::~Video() {
+  // destructor to be defined thoroughly later
 //   delete params; // does not have destructor
 //   delete metadata; // struct does not have destructor
 //   delete decoder; // should be fine
 //   delete streamFPS; // should be fine
 //   delete streamDuration; // should be fine
-}
+// }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 6d96e3262cd..92f3edcc686 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -26,42 +26,40 @@ using namespace ffmpeg;
 
 
 struct Video : torch::CustomClassHolder {
-    bool video_any_frame=false; // add this to input parameters
-    bool succeeded=false; // this is decoder init stuff
-    // seekTS acts as a flag - if it's not set, next function simply
-    // retruns the next frame. If it's set, we look at the global seek
-    // time in comination with any_frame settings
-    double seekTS=-1; 
-    bool doSeek=false;
-    std::tuple<std::string, long> current_stream;
+    
+    
+    
+    std::tuple<std::string, long> current_stream; // streaam type, id
+    // global video metadata
     std::map<std::string, std::vector<double>> streamFPS;
     std::map<std::string, std::vector<double>> streamDuration;
-    DecoderMetadata videoMetadata;
     public:
         Video(std::string videoPath, std::string stream, bool isReadFile);
-        ~Video();
         std::tuple<std::string, int64_t> getCurrentStream() const;
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         void Seek(double ts, bool any_frame);
-        torch::List<torch::Tensor> Next(std::string stream); //
+        torch::List<torch::Tensor> Next(std::string stream);
 
     private:
+        bool video_any_frame=false; // add this to input parameters?
+        bool succeeded=false; // decoder init flag
+        // seekTS and doSeek act as a flag - if it's not set, next function simply
+        // retruns the next frame. If it's set, we look at the global seek
+        // time in comination with any_frame settings
+        double seekTS=-1; 
+        bool doSeek=false;
+
         void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
         bool _setCurrentStream();
-        std::map<std::string, std::vector<double>> streamTimeBase;
-
-        SyncDecoder decoder;
-        DecoderParameters params;
+        std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
         DecoderInCallback callback = nullptr;;
         std::vector<DecoderMetadata> metadata;
-        
-        
-        // torch::List<torch::Tensor> Peak(std::string stream="")
+                
     protected:
-        // AV container type (check in decoder for exact type)
-
+        SyncDecoder decoder;
+        DecoderParameters params;
     
 }; // struct Video
 

From 262a6475d6c274b5f3925e9c940c8874eadd6f46 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 17 Sep 2020 07:06:05 -0500
Subject: [PATCH 070/128] introducing new tests for the API

---
 test/test_video.py                   | 160 +++++++++++++++++++++++++++
 torchvision/csrc/cpu/video/Video.cpp |   1 +
 2 files changed, 161 insertions(+)
 create mode 100644 test/test_video.py

diff --git a/test/test_video.py b/test/test_video.py
new file mode 100644
index 00000000000..76a0dd60c06
--- /dev/null
+++ b/test/test_video.py
@@ -0,0 +1,160 @@
+import os
+import collections
+import contextlib
+import tempfile
+import unittest
+
+
+import numpy as np
+
+import torch
+import torchvision
+from torchvision.io import _HAS_VIDEO_OPT
+
+
+
+VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
+
+CheckerConfig = [
+    "duration",
+    "video_fps",
+    "audio_sample_rate",
+    # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are
+    # slightly different between TorchVision decoder and PyAv decoder. So omit it during check
+    "check_aframes",
+    "check_aframe_pts",
+]
+GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig))
+
+all_check_config = GroundTruth(
+    duration=0,
+    video_fps=0,
+    audio_sample_rate=0,
+    check_aframes=True,
+    check_aframe_pts=True,
+)
+
+test_videos = {
+    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(
+        duration=2.0,
+        video_fps=30.0,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "v_SoccerJuggling_g23_c01.avi": GroundTruth(
+        duration=8.0,
+        video_fps=29.97,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    "v_SoccerJuggling_g24_c01.avi": GroundTruth(
+        duration=8.0,
+        video_fps=29.97,
+        audio_sample_rate=None,
+        check_aframes=True,
+        check_aframe_pts=True,
+    ),
+    ### Last three test segfault on video reader (see issues)
+    # "R6llTwEh07w.mp4": GroundTruth(
+    #     duration=10.0,
+    #     video_fps=30.0,
+    #     audio_sample_rate=44100,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+    # "SOX5yA1l24A.mp4": GroundTruth(
+    #     duration=11.0,
+    #     video_fps=29.97,
+    #     audio_sample_rate=48000,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+    # "WUzgd7C1pWA.mp4": GroundTruth(
+    #     duration=11.0,
+    #     video_fps=29.97,
+    #     audio_sample_rate=48000,
+    #     # PyAv miss one audio frame at the beginning (pts=0)
+    #     check_aframes=False,
+    #     check_aframe_pts=False,
+    # ),
+}
+
+
+@unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
+class TestVideo(unittest.TestCase):
+    def test_read_video_tensor(self):
+        """
+        Check if reading the video using the `next` based API yields the
+        same sized and equal tensors as video_reader.
+        """
+        print("test read")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            print(test_video)
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            # pass 1: decode all frames using existing TV decoder
+            tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
+            # pass 2: decode all frames using new api
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            frames = []
+            t, _ = reader.next("")
+            while t.numel() > 0:
+                frames.append(t)
+                t, _ = reader.next("")
+            new_api = torch.stack(frames, 0)
+            self.assertEqual(tv_result.size(), new_api.size())
+            self.assertEqual(torch.equal(tv_result, new_api), True)
+    
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_pts(self):
+        """Check if the frames have the same timestamps
+        """
+        print("test timestamp")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            tv_timestamps, _ =  torchvision.io.read_video_timestamps(full_path, pts_unit='sec')
+            # pass 2: decode all frames using new api
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            pts = []
+            t, p = reader.next("")
+            while t.numel() > 0:
+                pts.append(p)
+                t, p = reader.next("")
+            
+            tv_timestamps = [float(p) for p in tv_timestamps]
+            napi_pts = [float(p.item()) for p in pts]
+            for i in range(len(napi_pts)):
+                self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
+
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_metadata(self):
+        print("test fps")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            self.assertAlmostEqual(config.video_fps, reader.fps("")[0], delta=0.0001)
+            self.assertAlmostEqual(config.duration, reader.duration("")[0], delta=0.5)
+      
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index f31b1614167..2e8e5ac9b88 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -188,6 +188,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
+      format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From a7c531c582a2bab1638266cdf8abc657828e95a8 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 17 Sep 2020 07:06:22 -0500
Subject: [PATCH 071/128] cleanup

---
 video_reader.todo | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 video_reader.todo

diff --git a/video_reader.todo b/video_reader.todo
deleted file mode 100644
index cc9bc343f23..00000000000
--- a/video_reader.todo
+++ /dev/null
@@ -1,19 +0,0 @@
-Documented edgecases that don't work:
-    ☐ seeking with anyframe=True
-    ✔ last frame segfaults - exit cleanly @started(20-09-02 10:44) @done(20-09-02 11:07) @lasted(23m1s)
-    ✔ switching modalities in the subsequent calls to next() @done(20-09-03 05:33)
-    ```
-    video.next("video:0")
-    video.next("video:0")
-    video.next("audio:0")
-    ```
-
-
-Random todo's:
-    ✔ add check for the current stream @done(20-09-02 06:37)
-    ☐ ensure warning if stream is out of bounds
-    ✔ warning if stream is not auto selct @done(20-09-03 05:33)
-    ☐ can we make this an iterable?
-    ☐ destructors
-    ☐ adding tests to test.py
-    ☐ thorough checking for memory leaks
\ No newline at end of file

From ce8665c8cf16063c488e14db19c3985167628f56 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 04:53:45 -0500
Subject: [PATCH 072/128] Comment out unnecessary format (will add following
 FFMPEG fix)

---
 torchvision/csrc/cpu/video/Video.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 2e8e5ac9b88..1ca677787e4 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -188,7 +188,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
-      format.format.video.format = defaultVideoPixelFormat;
+      // format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From ad65f132e8b3955c78314d6af8127f860e68998a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:07:45 -0500
Subject: [PATCH 073/128] Reformat parsing function

---
 torchvision/csrc/cpu/video/Video.cpp | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 1ca677787e4..123410150c1 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -25,6 +25,7 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
+
 const size_t decoderTimeoutMs = 600000;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
@@ -69,7 +70,7 @@ size_t fillAudioTensor(
   return fillTensorList<float>(msgs, audioFrame, audioFramePts);
 }
 
-std::string parse_type_to_string(const std::string& stream_string) {
+std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
       {"audio", TYPE_AUDIO},
@@ -83,28 +84,19 @@ std::string parse_type_to_string(const std::string& stream_string) {
         return p.first == stream_string;
       });
   if (device != types.end()) {
-    return device->first;
+    return device;
   }
   AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
+std::string parse_type_to_string(const std::string& stream_string){
+    auto device = _parse_type(stream_string);
+    return device->first;
+}
+
 MediaType parse_type_to_mt(const std::string& stream_string) {
-  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
-      {"video", TYPE_VIDEO},
-      {"audio", TYPE_AUDIO},
-      {"subtitle", TYPE_SUBTITLE},
-      {"cc", TYPE_CC},
-  }};
-  auto device = std::find_if(
-      types.begin(),
-      types.end(),
-      [stream_string](const std::pair<std::string, MediaType>& p) {
-        return p.first == stream_string;
-      });
-  if (device != types.end()) {
+    auto device = _parse_type(stream_string);
     return device->second;
-  }
-  AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
 std::tuple<std::string, long> _parseStream(const std::string& streamString) {

From fd29585e6e07c9150a9b33adaa2d88bc6168e7c1 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:11:29 -0500
Subject: [PATCH 074/128] removing the seek bug in `get_decoder_params`

---
 torchvision/csrc/cpu/video/Video.cpp | 3 ++-
 torchvision/csrc/cpu/video/Video.h   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 123410150c1..0436bb71217 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -130,12 +130,13 @@ std::tuple<std::string, long> _parseStream(const std::string& streamString) {
 
 void Video::_getDecoderParams(
 
-    int64_t videoStartS,
+    double videoStartS,
     int64_t getPtsOnly,
     std::string stream,
     long stream_id = -1,
     bool all_streams = false,
     double seekFrameMarginUs = 10) {
+  
   int64_t videoStartUs = int64_t(videoStartS * 1e6);
 
   params.timeoutMs = decoderTimeoutMs;
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 92f3edcc686..7435c015be4 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -50,7 +50,7 @@ struct Video : torch::CustomClassHolder {
         double seekTS=-1; 
         bool doSeek=false;
 
-        void _getDecoderParams(int64_t videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
+        void _getDecoderParams(double videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
         bool _setCurrentStream();
         std::map<std::string, std::vector<double>> streamTimeBase; // not used
 

From 21e73a88c5e6127d79646f4213698faba4983350 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:19:30 -0500
Subject: [PATCH 075/128] Removing unnecessary code/variables

---
 torchvision/csrc/cpu/video/Video.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 0436bb71217..66e142d632c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -141,7 +141,7 @@ void Video::_getDecoderParams(
 
   params.timeoutMs = decoderTimeoutMs;
   params.startOffset = videoStartUs;
-  params.seekAccuracy = 10;
+  params.seekAccuracy = seekFrameMarginUs;
   params.headerOnly = false;
 
   params.preventStaleness = false; // not sure what this is about
@@ -333,7 +333,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     const auto& format = header.format;
 
     // initialize the output variables based on type
-    size_t expectedWrittenBytes = 0;
 
     if (format.type == TYPE_VIDEO) {
       // note: this can potentially be optimized
@@ -343,7 +342,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      expectedWrittenBytes = outHeight * outWidth * numChannels;
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -357,7 +355,6 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
 
-      expectedWrittenBytes = numAudioSamples * outAudioChannels * sizeof(float);
     }
     // currently not supporting other formats (will do soon)
 

From 9cb8af5b744c6cf0333fb680318a08780f94de0d Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:23:19 -0500
Subject: [PATCH 076/128] enforce RGB24 as a reading format (will crash before
 ffmpeg fix)

---
 torchvision/csrc/cpu/video/Video.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 66e142d632c..6fee40aa17d 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -27,6 +27,7 @@ using namespace ffmpeg;
 
 
 const size_t decoderTimeoutMs = 600000;
+const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -181,7 +182,7 @@ void Video::_getDecoderParams(
       format.format.video.width = 0;
       format.format.video.height = 0;
       format.format.video.cropImage = 0;
-      // format.format.video.format = defaultVideoPixelFormat;
+      format.format.video.format = defaultVideoPixelFormat;
     }
     params.formats.insert(format);
   }

From e6c8ccc189be6946f2c8be9b927558d69e82f7ed Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:40:32 -0500
Subject: [PATCH 077/128] permute the dimensions to return (RGB x H x W)

---
 test/test_video.py                   |  1 +
 torchvision/csrc/cpu/video/Video.cpp | 12 +++++-------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 76a0dd60c06..c7afb6a8aa9 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -112,6 +112,7 @@ def test_read_video_tensor(self):
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using existing TV decoder
             tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
+            tv_result = tv_result.permute(0, 3, 1, 2)
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             frames = []
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 6fee40aa17d..be8b1d2afce 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -343,6 +343,9 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      outFrame = outFrame.permute({2, 0, 1});
+
     } else if (format.type == TYPE_AUDIO) {
       int outAudioChannels = format.format.audio.channels;
       int bytesPerSample = av_get_bytes_per_sample(
@@ -355,16 +358,11 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
+      
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS); 
     }
     // currently not supporting other formats (will do soon)
 
-    // note: this will need to be revised to support less-accurate seek. So far keep as is
-    if (format.type == TYPE_VIDEO) {
-      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
-    } else {
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS);
-    }
     out.payload.reset();
   } else{
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";

From 20edb6c8e57bd64b81e69f157846f7d55d0b09a0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:54:53 -0500
Subject: [PATCH 078/128] Changing the return type to std::tuple<torch::Tensor,
 double> as opposed to tensor list

---
 torchvision/csrc/cpu/video/Video.cpp | 37 +++++++++++-----------------
 torchvision/csrc/cpu/video/Video.h   |  2 +-
 2 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index be8b1d2afce..c6d556c0356 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -38,18 +38,10 @@ const size_t timeBaseJitterUs = 100;
 template <typename T>
 size_t fillTensorList(
     DecoderOutputMessage& msgs,
-    torch::Tensor& frame,
-    torch::Tensor& framePts) {
-  // set up PTS data
+    torch::Tensor& frame) {
+  
   const auto& msg = msgs;
-
-  float* framePtsData = framePts.data_ptr<float>();
-
-  float pts_s = float(float(msg.header.pts) * 1e-6);
-  framePtsData[0] = pts_s;
-
   T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-
   if (frameData) {
     auto sizeInBytes = msg.payload->length();
     memcpy(frameData, msg.payload->data(), sizeInBytes);
@@ -59,16 +51,14 @@ size_t fillTensorList(
 
 size_t fillVideoTensor(
     DecoderOutputMessage& msgs,
-    torch::Tensor& videoFrame,
-    torch::Tensor& videoFramePts) {
-  return fillTensorList<uint8_t>(msgs, videoFrame, videoFramePts);
+    torch::Tensor& videoFrame) {
+  return fillTensorList<uint8_t>(msgs, videoFrame);
 }
 
 size_t fillAudioTensor(
     DecoderOutputMessage& msgs,
-    torch::Tensor& audioFrame,
-    torch::Tensor& audioFramePts) {
-  return fillTensorList<float>(msgs, audioFrame, audioFramePts);
+    torch::Tensor& audioFrame) {
+  return fillTensorList<float>(msgs, audioFrame);
 }
 
 std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
@@ -300,7 +290,7 @@ void Video::Seek(double ts, bool any_frame = false) {
   doSeek = true;
 }
 
-torch::List<torch::Tensor> Video::Next(std::string stream) {
+std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
   
   bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
@@ -322,7 +312,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
 
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
-  torch::Tensor framePTS = torch::zeros({1}, torch::kFloat);
+  double frame_pts_s;
   torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
 
   // decode single frame
@@ -330,6 +320,9 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
   // if successfull
   if (res == 0) {
+
+    frame_pts_s = double(double(out.header.pts) * 1e-6);
+    
     auto header = out.header;
     const auto& format = header.format;
 
@@ -343,7 +336,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       int outWidth = format.format.video.width;
       int numChannels = 3;
       outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      auto numberWrittenBytes = fillVideoTensor(out, outFrame, framePTS);
+      auto numberWrittenBytes = fillVideoTensor(out, outFrame);
       outFrame = outFrame.permute({2, 0, 1});
 
     } else if (format.type == TYPE_AUDIO) {
@@ -359,7 +352,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
       
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame, framePTS); 
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame); 
     }
     // currently not supporting other formats (will do soon)
 
@@ -368,9 +361,7 @@ torch::List<torch::Tensor> Video::Next(std::string stream) {
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
-  torch::List<torch::Tensor> result;
-  result.push_back(outFrame);
-  result.push_back(framePTS);
+  std::tuple<torch::Tensor, double> result = {outFrame, frame_pts_s};
   return result;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 7435c015be4..b76831d6770 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -39,7 +39,7 @@ struct Video : torch::CustomClassHolder {
         std::vector<double> getDuration(std::string stream="") const;
         std::vector<double> getFPS(std::string stream="") const;
         void Seek(double ts, bool any_frame);
-        torch::List<torch::Tensor> Next(std::string stream);
+        std::tuple<torch::Tensor, double> Next(std::string stream);
 
     private:
         bool video_any_frame=false; // add this to input parameters?

From 23f9a4f5d08e22faf6fc9cb97cc3b3b5d2137837 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 05:55:17 -0500
Subject: [PATCH 079/128] Adjusting tests for the new return type

---
 test/test_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_video.py b/test/test_video.py
index c7afb6a8aa9..32a36b63cb0 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -143,7 +143,7 @@ def test_pts(self):
                 t, p = reader.next("")
             
             tv_timestamps = [float(p) for p in tv_timestamps]
-            napi_pts = [float(p.item()) for p in pts]
+            napi_pts = [float(p) for p in pts]
             for i in range(len(napi_pts)):
                 self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
 

From 5e21664c3a79e3db11eba355f1b829e53ed5da27 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 06:00:01 -0500
Subject: [PATCH 080/128] remove unnecessary jitter

---
 torchvision/csrc/cpu/video/Video.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index c6d556c0356..1d271c52198 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -29,10 +29,6 @@ using namespace ffmpeg;
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
-// A jitter can be added to the end of the range to avoid conversion/rounding
-// error, small value 100us won't be enough to select the next frame, but enough
-// to compensate rounding error due to the multiple conversions.
-const size_t timeBaseJitterUs = 100;
 
 // returns number of written bytes
 template <typename T>

From 5d7a0df7dcdd0a596f9d2e9fa8da265a31a2d8d3 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 23 Sep 2020 06:13:37 -0500
Subject: [PATCH 081/128] apply clang-format to video reader sources

---
 torchvision/csrc/cpu/video/Video.cpp    | 53 +++++++---------
 torchvision/csrc/cpu/video/Video.h      | 84 ++++++++++++-------------
 torchvision/csrc/cpu/video/register.cpp |  2 +-
 3 files changed, 66 insertions(+), 73 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 1d271c52198..9daf70548d7 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -25,17 +25,13 @@ using namespace ffmpeg;
 // #endif
 // #endif
 
-
 const size_t decoderTimeoutMs = 600000;
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
 
 // returns number of written bytes
 template <typename T>
-size_t fillTensorList(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& frame) {
-  
+size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
   const auto& msg = msgs;
   T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
   if (frameData) {
@@ -45,19 +41,19 @@ size_t fillTensorList(
   return sizeof(T);
 }
 
-size_t fillVideoTensor(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& videoFrame) {
+size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
   return fillTensorList<uint8_t>(msgs, videoFrame);
 }
 
-size_t fillAudioTensor(
-    DecoderOutputMessage& msgs,
-    torch::Tensor& audioFrame) {
+size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
   return fillTensorList<float>(msgs, audioFrame);
 }
 
-std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ffmpeg::MediaType> const* _parse_type(const std::string& stream_string) {
+std::pair<
+    std::__cxx11::
+        basic_string<char, std::char_traits<char>, std::allocator<char>>,
+    ffmpeg::MediaType> const*
+_parse_type(const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
       {"audio", TYPE_AUDIO},
@@ -76,14 +72,14 @@ std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocato
   AT_ERROR("Expected one of [audio, video, subtitle, cc] ", stream_string);
 }
 
-std::string parse_type_to_string(const std::string& stream_string){
-    auto device = _parse_type(stream_string);
-    return device->first;
+std::string parse_type_to_string(const std::string& stream_string) {
+  auto device = _parse_type(stream_string);
+  return device->first;
 }
 
 MediaType parse_type_to_mt(const std::string& stream_string) {
-    auto device = _parse_type(stream_string);
-    return device->second;
+  auto device = _parse_type(stream_string);
+  return device->second;
 }
 
 std::tuple<std::string, long> _parseStream(const std::string& streamString) {
@@ -123,7 +119,6 @@ void Video::_getDecoderParams(
     long stream_id = -1,
     bool all_streams = false,
     double seekFrameMarginUs = 10) {
-  
   int64_t videoStartUs = int64_t(videoStartS * 1e6);
 
   params.timeoutMs = decoderTimeoutMs;
@@ -287,19 +282,18 @@ void Video::Seek(double ts, bool any_frame = false) {
 }
 
 std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
-  
   bool newInit = false; // avoid unnecessary decoder initializations
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-      current_stream = _parseStream(stream);
-      newInit = true;
+    current_stream = _parseStream(stream);
+    newInit = true;
   }
 
   if ((seekTS != -1) && (doSeek == true)) {
-      newInit = true;
-      doSeek = false;
+    newInit = true;
+    doSeek = false;
   }
 
-  if (newInit){
+  if (newInit) {
     succeeded = Video::_setCurrentStream();
     if (succeeded) {
       newInit = false;
@@ -316,9 +310,8 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
   int64_t res = decoder.decode(&out, decoderTimeoutMs);
   // if successfull
   if (res == 0) {
-
     frame_pts_s = double(double(out.header.pts) * 1e-6);
-    
+
     auto header = out.header;
     const auto& format = header.format;
 
@@ -347,13 +340,13 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
 
       outFrame =
           torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-      
-      auto numberWrittenBytes = fillAudioTensor(out, outFrame); 
+
+      auto numberWrittenBytes = fillAudioTensor(out, outFrame);
     }
     // currently not supporting other formats (will do soon)
 
     out.payload.reset();
-  } else{
+  } else {
     LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
   }
 
@@ -362,7 +355,7 @@ std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
 }
 
 // Video::~Video() {
-  // destructor to be defined thoroughly later
+// destructor to be defined thoroughly later
 //   delete params; // does not have destructor
 //   delete metadata; // struct does not have destructor
 //   delete decoder; // should be fine
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index b76831d6770..1fd07901665 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -3,11 +3,10 @@
 #ifndef VIDEO_H_
 #define VIDEO_H_
 
-
+#include <map>
+#include <regex>
 #include <string>
 #include <vector>
-#include <regex>
-#include <map>
 
 #include <ATen/ATen.h>
 #include <Python.h>
@@ -15,53 +14,54 @@
 #include <torch/script.h>
 
 
-
 #include <exception>
-#include "sync_decoder.h"
-#include "memory_buffer.h"
 #include "defs.h"
-#include "util.h"
+#include "memory_buffer.h"
+#include "sync_decoder.h"
 
 using namespace ffmpeg;
 
-
 struct Video : torch::CustomClassHolder {
-    
-    
-    
-    std::tuple<std::string, long> current_stream; // streaam type, id
-    // global video metadata
-    std::map<std::string, std::vector<double>> streamFPS;
-    std::map<std::string, std::vector<double>> streamDuration;
-    public:
-        Video(std::string videoPath, std::string stream, bool isReadFile);
-        std::tuple<std::string, int64_t> getCurrentStream() const;
-        std::vector<double> getDuration(std::string stream="") const;
-        std::vector<double> getFPS(std::string stream="") const;
-        void Seek(double ts, bool any_frame);
-        std::tuple<torch::Tensor, double> Next(std::string stream);
+  std::tuple<std::string, long> current_stream; // streaam type, id
+  // global video metadata
+  std::map<std::string, std::vector<double>> streamFPS;
+  std::map<std::string, std::vector<double>> streamDuration;
 
-    private:
-        bool video_any_frame=false; // add this to input parameters?
-        bool succeeded=false; // decoder init flag
-        // seekTS and doSeek act as a flag - if it's not set, next function simply
-        // retruns the next frame. If it's set, we look at the global seek
-        // time in comination with any_frame settings
-        double seekTS=-1; 
-        bool doSeek=false;
+ public:
+  Video(std::string videoPath, std::string stream, bool isReadFile);
+  std::tuple<std::string, int64_t> getCurrentStream() const;
+  std::vector<double> getDuration(std::string stream = "") const;
+  std::vector<double> getFPS(std::string stream = "") const;
+  void Seek(double ts, bool any_frame);
+  std::tuple<torch::Tensor, double> Next(std::string stream);
 
-        void _getDecoderParams(double videoStartS, int64_t getPtsOnly, std::string stream, long stream_id, bool all_streams, double seekFrameMarginUs); // this needs to be improved
-        bool _setCurrentStream();
-        std::map<std::string, std::vector<double>> streamTimeBase; // not used
+ private:
+  bool video_any_frame = false; // add this to input parameters?
+  bool succeeded = false; // decoder init flag
+  // seekTS and doSeek act as a flag - if it's not set, next function simply
+  // retruns the next frame. If it's set, we look at the global seek
+  // time in comination with any_frame settings
+  double seekTS = -1;
+  bool doSeek = false;
 
-        DecoderInCallback callback = nullptr;;
-        std::vector<DecoderMetadata> metadata;
-                
-    protected:
-        SyncDecoder decoder;
-        DecoderParameters params;
-    
-}; // struct Video
+  void _getDecoderParams(
+      double videoStartS,
+      int64_t getPtsOnly,
+      std::string stream,
+      long stream_id,
+      bool all_streams,
+      double seekFrameMarginUs); // this needs to be improved
+  bool _setCurrentStream();
+  std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
+  DecoderInCallback callback = nullptr;
+  ;
+  std::vector<DecoderMetadata> metadata;
+
+ protected:
+  SyncDecoder decoder;
+  DecoderParameters params;
+
+}; // struct Video
 
-#endif  // VIDEO_H_
+#endif // VIDEO_H_
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 091052f4808..cfd2d2dd5c1 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -14,5 +14,5 @@ static auto registerVideo =
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);
 
-} //namespace
+} // namespace
 #endif

From 0720b2ab802f4d842f9f8357278db6e9bbb382f5 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 24 Sep 2020 10:43:02 +0100
Subject: [PATCH 082/128] Metadata return changes (#1)

---
 test/test_video.py                      |  6 ++---
 torchvision/csrc/cpu/video/Video.cpp    | 31 +++++++------------------
 torchvision/csrc/cpu/video/Video.h      |  6 ++---
 torchvision/csrc/cpu/video/register.cpp |  3 +--
 4 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 32a36b63cb0..05e75e2f598 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -108,7 +108,6 @@ def test_read_video_tensor(self):
         print("test read")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
-            print(test_video)
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using existing TV decoder
             tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
@@ -154,8 +153,9 @@ def test_metadata(self):
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             reader = torch.classes.torchvision.Video(full_path, "video", True)
-            self.assertAlmostEqual(config.video_fps, reader.fps("")[0], delta=0.0001)
-            self.assertAlmostEqual(config.duration, reader.duration("")[0], delta=0.5)
+            reader_md = reader.get_metadata()
+            self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
+            self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
       
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 9daf70548d7..31f409009b5 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -190,9 +190,13 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   logType = "file";
   logMessage = videoPath;
 
+  // locals
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
+  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
+  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
+
 
   // calback and metadata defined in struct
   succeeded = decoder.init(params, std::move(callback), &metadata);
@@ -217,8 +221,9 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       };
     }
   }
-  streamFPS.insert({{"video", videoFPS}, {"audio", audioFPS}});
-  streamDuration.insert({{"video", videoDuration}, {"audio", audioDuration}});
+  audioMetadata.insert({{"duration", audioDuration}, {"framerate", audioFPS}});
+  videoMetadata.insert({{"duration", videoDuration}, {"fps", videoFPS}});
+  streamsMetadata.insert({{"video", videoMetadata}, {"audio", audioMetadata}});
 
   succeeded = Video::_setCurrentStream();
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
@@ -252,26 +257,8 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-std::vector<double> Video::getFPS(std::string stream) const {
-  // add safety check
-  if (stream.empty()) {
-    stream = get<0>(current_stream);
-  }
-  auto stream_tpl = _parseStream(stream);
-  std::string stream_str = get<0>(stream_tpl);
-  // check if the stream exists
-  return streamFPS.at(stream_str);
-}
-
-std::vector<double> Video::getDuration(std::string stream) const {
-  // add safety check
-  if (stream.empty()) {
-    stream = get<0>(current_stream);
-  }
-  auto stream_tpl = _parseStream(stream);
-  std::string stream_str = get<0>(stream_tpl);
-  // check if the stream exists
-  return streamDuration.at(stream_str);
+std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
+  return streamsMetadata;
 }
 
 void Video::Seek(double ts, bool any_frame = false) {
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 1fd07901665..5af73fe5330 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -24,14 +24,12 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
   std::tuple<std::string, long> current_stream; // streaam type, id
   // global video metadata
-  std::map<std::string, std::vector<double>> streamFPS;
-  std::map<std::string, std::vector<double>> streamDuration;
+  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
 
  public:
   Video(std::string videoPath, std::string stream, bool isReadFile);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  std::vector<double> getDuration(std::string stream = "") const;
-  std::vector<double> getFPS(std::string stream = "") const;
+  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
   void Seek(double ts, bool any_frame);
   std::tuple<torch::Tensor, double> Next(std::string stream);
 
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index cfd2d2dd5c1..8f3f46072f5 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,8 +9,7 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
-        .def("duration", &Video::getDuration)
-        .def("fps", &Video::getFPS)
+        .def("get_metadata", &Video::getStreamMetadata)
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);
 

From 79e42bc401fabc6738aad6b03a536b69dbe4f0d4 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 24 Sep 2020 16:09:31 +0100
Subject: [PATCH 083/128] remove implicit calls to set a current stream (#2)

---
 torchvision/csrc/cpu/video/Video.cpp    | 63 +++++++++++++------------
 torchvision/csrc/cpu/video/Video.h      | 13 ++---
 torchvision/csrc/cpu/video/register.cpp |  1 +
 3 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 31f409009b5..675ba962e3c 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -194,8 +194,8 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
-  std::unordered_map<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
 
 
   // calback and metadata defined in struct
@@ -221,11 +221,14 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
       };
     }
   }
-  audioMetadata.insert({{"duration", audioDuration}, {"framerate", audioFPS}});
-  videoMetadata.insert({{"duration", videoDuration}, {"fps", videoFPS}});
-  streamsMetadata.insert({{"video", videoMetadata}, {"audio", audioMetadata}});
-
-  succeeded = Video::_setCurrentStream();
+  audioMetadata.insert("duration", audioDuration);
+  audioMetadata.insert("framerate", audioFPS);
+  videoMetadata.insert("duration", videoDuration);
+  videoMetadata.insert("fps", videoFPS);
+  streamsMetadata.insert("video", videoMetadata);
+  streamsMetadata.insert("audio", audioMetadata);
+
+  succeeded = Video::setCurrentStream();
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
   if (get<1>(current_stream) != -1) {
     LOG(INFO)
@@ -234,7 +237,12 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   }
 } // video
 
-bool Video::_setCurrentStream() {
+bool Video::setCurrentStream(std::string stream) {
+  
+  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
+    current_stream = _parseStream(stream);
+  }
+
   double ts = 0;
   if (seekTS > 0) {
     ts = seekTS;
@@ -250,6 +258,7 @@ bool Video::_setCurrentStream() {
   );
 
   // calback and metadata defined in Video.h
+  cout << "Decoder init at setStream " << succeeded << "\n" ;
   return (decoder.init(params, std::move(callback), &metadata));
 }
 
@@ -257,35 +266,27 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
+c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
   return streamsMetadata;
 }
 
-void Video::Seek(double ts, bool any_frame = false) {
+void Video::Seek(double ts) {
   // initialize the class variables used for seeking and retrurn
-  video_any_frame = any_frame;
-  seekTS = ts;
-  doSeek = true;
-}
-
-std::tuple<torch::Tensor, double> Video::Next(std::string stream) {
-  bool newInit = false; // avoid unnecessary decoder initializations
-  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-    current_stream = _parseStream(stream);
-    newInit = true;
-  }
+  _getDecoderParams(
+      ts, // video start
+      0, // headerOnly
+      get<0>(current_stream), // stream
+      long(get<1>(
+          current_stream)), // stream_id parsed from info above change to -2
+      false // read all streams
+  );
 
-  if ((seekTS != -1) && (doSeek == true)) {
-    newInit = true;
-    doSeek = false;
-  }
+  // calback and metadata defined in Video.h
+  succeeded = decoder.init(params, std::move(callback), &metadata);
+  cout << "Decoder init at seek " << succeeded << "\n" ;
+}
 
-  if (newInit) {
-    succeeded = Video::_setCurrentStream();
-    if (succeeded) {
-      newInit = false;
-    }
-  }
+std::tuple<torch::Tensor, double> Video::Next() {
 
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 5af73fe5330..fcbadb8ce19 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -22,16 +22,17 @@
 using namespace ffmpeg;
 
 struct Video : torch::CustomClassHolder {
-  std::tuple<std::string, long> current_stream; // streaam type, id
+  std::tuple<std::string, long> current_stream; // stream type, id
   // global video metadata
-  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
 
  public:
   Video(std::string videoPath, std::string stream, bool isReadFile);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  std::unordered_map<std::string, std::unordered_map<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
-  void Seek(double ts, bool any_frame);
-  std::tuple<torch::Tensor, double> Next(std::string stream);
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
+  void Seek(double ts);
+  bool setCurrentStream(std::string stream = "video");
+  std::tuple<torch::Tensor, double> Next();
 
  private:
   bool video_any_frame = false; // add this to input parameters?
@@ -49,7 +50,7 @@ struct Video : torch::CustomClassHolder {
       long stream_id,
       bool all_streams,
       double seekFrameMarginUs); // this needs to be improved
-  bool _setCurrentStream();
+
   std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
   DecoderInCallback callback = nullptr;
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index 8f3f46072f5..b5d68bd68c0 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -9,6 +9,7 @@ static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
         .def(torch::init<std::string, std::string, bool>())
         .def("get_current_stream", &Video::getCurrentStream)
+        .def("set_current_stream", &Video::setCurrentStream)
         .def("get_metadata", &Video::getStreamMetadata)
         .def("seek", &Video::Seek)
         .def("next", &Video::Next);

From fb32beb8af1afd678eb6218c4679b7874041abaa Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 25 Sep 2020 12:34:47 -0500
Subject: [PATCH 084/128] Adding new tests to check the accuracy of the seek

---
 test/test_video.py | 77 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 7 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 05e75e2f598..ebf0a2a8ab8 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -97,6 +97,39 @@
     # ),
 }
 
+def _template_read_video(video_object, s=0, e=None):
+    
+    if e is None:
+        e = float("inf")
+    if e < s:
+        raise ValueError(
+            "end time should be larger than start time, got "
+            "start time={} and end time={}".format(s, e)
+        )
+
+    video_object.set_current_stream("video")
+    video_object.seek(s)
+    video_frames = torch.empty(0)
+    frames = []
+    t, pts = video_object.next()
+    while t.numel() > 0 and (pts >= s and pts <= e):
+        frames.append(t)
+        t, pts = video_object.next()
+    if len(frames) > 0:
+        video_frames = torch.stack(frames, 0)
+
+    video_object.set_current_stream("audio")
+    video_object.seek(s)
+    audio_frames = torch.empty(0)
+    frames = []
+    t, pts = video_object.next()
+    while t.numel() > 0 and (pts > s and pts <= e):
+        frames.append(t)
+        t, pts = video_object.next()
+    if len(frames) > 0:
+        audio_frames = torch.stack(frames, 0)
+
+    return video_frames, audio_frames, video_object.get_metadata()
 
 @unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
 class TestVideo(unittest.TestCase):
@@ -105,7 +138,6 @@ def test_read_video_tensor(self):
         Check if reading the video using the `next` based API yields the
         same sized and equal tensors as video_reader.
         """
-        print("test read")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -115,10 +147,10 @@ def test_read_video_tensor(self):
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             frames = []
-            t, _ = reader.next("")
+            t, _ = reader.next()
             while t.numel() > 0:
                 frames.append(t)
-                t, _ = reader.next("")
+                t, _ = reader.next()
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
             self.assertEqual(torch.equal(tv_result, new_api), True)
@@ -127,7 +159,6 @@ def test_read_video_tensor(self):
     def test_pts(self):
         """Check if the frames have the same timestamps
         """
-        print("test timestamp")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -136,10 +167,10 @@ def test_pts(self):
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video", True)
             pts = []
-            t, p = reader.next("")
+            t, p = reader.next()
             while t.numel() > 0:
                 pts.append(p)
-                t, p = reader.next("")
+                t, p = reader.next()
             
             tv_timestamps = [float(p) for p in tv_timestamps]
             napi_pts = [float(p) for p in pts]
@@ -148,7 +179,6 @@ def test_pts(self):
 
     @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
     def test_metadata(self):
-        print("test fps")
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
@@ -156,6 +186,39 @@ def test_metadata(self):
             reader_md = reader.get_metadata()
             self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
             self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
+
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_video_reading_fn(self):
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            video, audio, metadata = _template_read_video(reader)
+            tv_video, tv_audio, info = torchvision.io.read_video(full_path, pts_unit="sec")
+
+            self.assertEqual(torch.equal(tv_video.permute(0, 3, 1, 2), video), True)
+            self.assertEqual(torch.equal(tv_audio, audio), True)
+    
+    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+    def test_partial_video_reading_fn(self):
+        import random
+        print("Test video reader")
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            # select two random points between 0 and duration
+            r = []
+            r.append(random.uniform(0, config.duration))
+            r.append(random.uniform(0, config.duration))
+            s = min(r)
+            e = max(r)
+
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            video, audio, metadata = _template_read_video(reader, s, e)
+            tv_video, tv_audio, info = torchvision.io.read_video(full_path, start_pts=s, end_pts=e, pts_unit="sec")
+            self.assertAlmostEqual(tv_video.size(0), video.size(0), delta=2.0)
       
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 57b8776dfad1dbd66774dc25324d96ea2d0459b7 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 25 Sep 2020 12:36:13 -0500
Subject: [PATCH 085/128] cleanup debugging statements

---
 torchvision/csrc/cpu/video/Video.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 675ba962e3c..f40cbb2dfce 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -258,7 +258,6 @@ bool Video::setCurrentStream(std::string stream) {
   );
 
   // calback and metadata defined in Video.h
-  cout << "Decoder init at setStream " << succeeded << "\n" ;
   return (decoder.init(params, std::move(callback), &metadata));
 }
 
@@ -283,7 +282,7 @@ void Video::Seek(double ts) {
 
   // calback and metadata defined in Video.h
   succeeded = decoder.init(params, std::move(callback), &metadata);
-  cout << "Decoder init at seek " << succeeded << "\n" ;
+  LOG(INFO) << "Decoder init at seek " << succeeded << "\n" ;
 }
 
 std::tuple<torch::Tensor, double> Video::Next() {

From 921a747d1f79020525b037f48594fb0886cac982 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 29 Sep 2020 11:33:15 -0500
Subject: [PATCH 086/128] Addressing PR comments

---
 test/test_video.py                      | 82 ++++++++++++-------------
 torchvision/csrc/cpu/video/Video.cpp    | 30 +++++----
 torchvision/csrc/cpu/video/Video.h      | 16 +++--
 torchvision/csrc/cpu/video/register.cpp |  2 +-
 4 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index ebf0a2a8ab8..cd97b439c67 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -12,7 +12,6 @@
 from torchvision.io import _HAS_VIDEO_OPT
 
 
-
 VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
 
 CheckerConfig = [
@@ -70,35 +69,36 @@
         check_aframes=True,
         check_aframe_pts=True,
     ),
-    ### Last three test segfault on video reader (see issues)
-    # "R6llTwEh07w.mp4": GroundTruth(
-    #     duration=10.0,
-    #     video_fps=30.0,
-    #     audio_sample_rate=44100,
-    #     # PyAv miss one audio frame at the beginning (pts=0)
-    #     check_aframes=False,
-    #     check_aframe_pts=False,
-    # ),
-    # "SOX5yA1l24A.mp4": GroundTruth(
-    #     duration=11.0,
-    #     video_fps=29.97,
-    #     audio_sample_rate=48000,
-    #     # PyAv miss one audio frame at the beginning (pts=0)
-    #     check_aframes=False,
-    #     check_aframe_pts=False,
-    # ),
-    # "WUzgd7C1pWA.mp4": GroundTruth(
-    #     duration=11.0,
-    #     video_fps=29.97,
-    #     audio_sample_rate=48000,
-    #     # PyAv miss one audio frame at the beginning (pts=0)
-    #     check_aframes=False,
-    #     check_aframe_pts=False,
-    # ),
+    # Last three test segfault on video reader (see issues)
+    "R6llTwEh07w.mp4": GroundTruth(
+        duration=10.0,
+        video_fps=30.0,
+        audio_sample_rate=44100,
+        # PyAv miss one audio frame at the beginning (pts=0)
+        check_aframes=False,
+        check_aframe_pts=False,
+    ),
+    "SOX5yA1l24A.mp4": GroundTruth(
+        duration=11.0,
+        video_fps=29.97,
+        audio_sample_rate=48000,
+        # PyAv miss one audio frame at the beginning (pts=0)
+        check_aframes=False,
+        check_aframe_pts=False,
+    ),
+    "WUzgd7C1pWA.mp4": GroundTruth(
+        duration=11.0,
+        video_fps=29.97,
+        audio_sample_rate=48000,
+        # PyAv miss one audio frame at the beginning (pts=0)
+        check_aframes=False,
+        check_aframe_pts=False,
+    ),
 }
 
+
 def _template_read_video(video_object, s=0, e=None):
-    
+
     if e is None:
         e = float("inf")
     if e < s:
@@ -131,6 +131,7 @@ def _template_read_video(video_object, s=0, e=None):
 
     return video_frames, audio_frames, video_object.get_metadata()
 
+
 @unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
 class TestVideo(unittest.TestCase):
     def test_read_video_tensor(self):
@@ -145,7 +146,7 @@ def test_read_video_tensor(self):
             tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
             tv_result = tv_result.permute(0, 3, 1, 2)
             # pass 2: decode all frames using new api
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             frames = []
             t, _ = reader.next()
             while t.numel() > 0:
@@ -154,8 +155,7 @@ def test_read_video_tensor(self):
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
             self.assertEqual(torch.equal(tv_result, new_api), True)
-    
-    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+
     def test_pts(self):
         """Check if the frames have the same timestamps
         """
@@ -163,44 +163,41 @@ def test_pts(self):
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
-            tv_timestamps, _ =  torchvision.io.read_video_timestamps(full_path, pts_unit='sec')
+            tv_timestamps, _ = torchvision.io.read_video_timestamps(full_path, pts_unit='sec')
             # pass 2: decode all frames using new api
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             pts = []
             t, p = reader.next()
             while t.numel() > 0:
                 pts.append(p)
                 t, p = reader.next()
-            
+
             tv_timestamps = [float(p) for p in tv_timestamps]
             napi_pts = [float(p) for p in pts]
             for i in range(len(napi_pts)):
                 self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
 
-    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
     def test_metadata(self):
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             reader_md = reader.get_metadata()
             self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
             self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
 
-    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
     def test_video_reading_fn(self):
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             video, audio, metadata = _template_read_video(reader)
             tv_video, tv_audio, info = torchvision.io.read_video(full_path, pts_unit="sec")
 
             self.assertEqual(torch.equal(tv_video.permute(0, 3, 1, 2), video), True)
             self.assertEqual(torch.equal(tv_audio, audio), True)
-    
-    @unittest.skipIf(not _HAS_VIDEO_OPT, "video_reader backend is not chosen")
+
     def test_partial_video_reading_fn(self):
         import random
         print("Test video reader")
@@ -215,10 +212,11 @@ def test_partial_video_reading_fn(self):
             s = min(r)
             e = max(r)
 
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             video, audio, metadata = _template_read_video(reader, s, e)
             tv_video, tv_audio, info = torchvision.io.read_video(full_path, start_pts=s, end_pts=e, pts_unit="sec")
             self.assertAlmostEqual(tv_video.size(0), video.size(0), delta=2.0)
-      
+
+
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index bea0f16e676..820a71a4c24 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -49,11 +49,8 @@ size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
   return fillTensorList<float>(msgs, audioFrame);
 }
 
-std::pair<
-    std::__cxx11::
-        basic_string<char, std::char_traits<char>, std::allocator<char>>,
-    ffmpeg::MediaType> const*
-_parse_type(const std::string& stream_string) {
+std::pair<std::string, ffmpeg::MediaType> const* _parse_type(
+    const std::string& stream_string) {
   static const std::array<std::pair<std::string, MediaType>, 4> types = {{
       {"video", TYPE_VIDEO},
       {"audio", TYPE_AUDIO},
@@ -169,7 +166,7 @@ void Video::_getDecoderParams(
 
 } // _get decoder params
 
-Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
+Video::Video(std::string videoPath, std::string stream) {
   // parse stream information
   current_stream = _parseStream(stream);
   // note that in the initial call we want to get all streams
@@ -192,16 +189,16 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  audioMetadata;
-  c10::Dict<std::string, std::vector<double, std::allocator<double>>>  videoMetadata;
-
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>
+      audioMetadata;
+  c10::Dict<std::string, std::vector<double, std::allocator<double>>>
+      videoMetadata;
 
   // calback and metadata defined in struct
   succeeded = decoder.init(params, std::move(callback), &metadata);
   if (succeeded) {
     for (const auto& header : metadata) {
       double fps = double(header.fps);
-      double timeBase = double(header.num) / double(header.den);
       double duration = double(header.duration) * 1e-6; // * timeBase;
 
       if (header.format.type == TYPE_VIDEO) {
@@ -226,7 +223,7 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   streamsMetadata.insert("video", videoMetadata);
   streamsMetadata.insert("audio", audioMetadata);
 
-  succeeded = Video::setCurrentStream();
+  succeeded = Video::setCurrentStream(stream);
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
   if (get<1>(current_stream) != -1) {
     LOG(INFO)
@@ -235,8 +232,7 @@ Video::Video(std::string videoPath, std::string stream, bool isReadFile) {
   }
 } // video
 
-bool Video::setCurrentStream(std::string stream) {
-  
+bool Video::setCurrentStream(std::string stream = "video") {
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
     current_stream = _parseStream(stream);
   }
@@ -263,7 +259,10 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> Video::getStreamMetadata() const {
+c10::Dict<
+    std::string,
+    c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+Video::getStreamMetadata() const {
   return streamsMetadata;
 }
 
@@ -280,11 +279,10 @@ void Video::Seek(double ts) {
 
   // calback and metadata defined in Video.h
   succeeded = decoder.init(params, std::move(callback), &metadata);
-  LOG(INFO) << "Decoder init at seek " << succeeded << "\n" ;
+  LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
 }
 
 std::tuple<torch::Tensor, double> Video::Next() {
-
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
   double frame_pts_s;
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index fcbadb8ce19..716065e1236 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -13,7 +13,6 @@
 #include <c10/util/Logging.h>
 #include <torch/script.h>
 
-
 #include <exception>
 #include "defs.h"
 #include "memory_buffer.h"
@@ -24,14 +23,20 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
   std::tuple<std::string, long> current_stream; // stream type, id
   // global video metadata
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> streamsMetadata;
+  c10::Dict<
+      std::string,
+      c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+      streamsMetadata;
 
  public:
-  Video(std::string videoPath, std::string stream, bool isReadFile);
+  Video(std::string videoPath, std::string stream);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double, std::allocator<double>>>> getStreamMetadata() const;
+  c10::Dict<
+      std::string,
+      c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+  getStreamMetadata() const;
   void Seek(double ts);
-  bool setCurrentStream(std::string stream = "video");
+  bool setCurrentStream(std::string stream);
   std::tuple<torch::Tensor, double> Next();
 
  private:
@@ -54,7 +59,6 @@ struct Video : torch::CustomClassHolder {
   std::map<std::string, std::vector<double>> streamTimeBase; // not used
 
   DecoderInCallback callback = nullptr;
-  ;
   std::vector<DecoderMetadata> metadata;
 
  protected:
diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp
index b5d68bd68c0..a88615987bf 100644
--- a/torchvision/csrc/cpu/video/register.cpp
+++ b/torchvision/csrc/cpu/video/register.cpp
@@ -7,7 +7,7 @@ namespace {
 
 static auto registerVideo =
     torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, bool>())
+        .def(torch::init<std::string, std::string>())
         .def("get_current_stream", &Video::getCurrentStream)
         .def("set_current_stream", &Video::setCurrentStream)
         .def("get_metadata", &Video::getStreamMetadata)

From 54f5416a25c1fa28594181426219a585d9fd4406 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 30 Sep 2020 04:22:11 -0500
Subject: [PATCH 087/128] addressing Francisco's comments

---
 torchvision/csrc/cpu/video/Video.cpp | 6 +++---
 torchvision/csrc/cpu/video/Video.h   | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index 820a71a4c24..d70c1527757 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -189,9 +189,9 @@ Video::Video(std::string videoPath, std::string stream) {
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  c10::Dict<std::string, std::vector<double, std::allocator<double>>>
+  c10::Dict<std::string, std::vector<double>>
       audioMetadata;
-  c10::Dict<std::string, std::vector<double, std::allocator<double>>>
+  c10::Dict<std::string, std::vector<double>>
       videoMetadata;
 
   // calback and metadata defined in struct
@@ -261,7 +261,7 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
 
 c10::Dict<
     std::string,
-    c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+    c10::Dict<std::string, std::vector<double>>>
 Video::getStreamMetadata() const {
   return streamsMetadata;
 }
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 716065e1236..76fd1e64828 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -25,7 +25,7 @@ struct Video : torch::CustomClassHolder {
   // global video metadata
   c10::Dict<
       std::string,
-      c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+      c10::Dict<std::string, std::vector<double>>>
       streamsMetadata;
 
  public:
@@ -33,7 +33,7 @@ struct Video : torch::CustomClassHolder {
   std::tuple<std::string, int64_t> getCurrentStream() const;
   c10::Dict<
       std::string,
-      c10::Dict<std::string, std::vector<double, std::allocator<double>>>>
+      c10::Dict<std::string, std::vector<double>>>
   getStreamMetadata() const;
   void Seek(double ts);
   bool setCurrentStream(std::string stream);

From 88e2741cec29a2b50f5c322c434c357bccac5f00 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 30 Sep 2020 05:08:05 -0500
Subject: [PATCH 088/128] CLANG build formatting

---
 torchvision/csrc/cpu/video/Video.cpp | 12 ++++--------
 torchvision/csrc/cpu/video/Video.h   |  8 ++------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index d70c1527757..f90c2c04871 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -189,10 +189,8 @@ Video::Video(std::string videoPath, std::string stream) {
   std::vector<double> audioFPS, videoFPS, ccFPS, subsFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
   std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  c10::Dict<std::string, std::vector<double>>
-      audioMetadata;
-  c10::Dict<std::string, std::vector<double>>
-      videoMetadata;
+  c10::Dict<std::string, std::vector<double>> audioMetadata;
+  c10::Dict<std::string, std::vector<double>> videoMetadata;
 
   // calback and metadata defined in struct
   succeeded = decoder.init(params, std::move(callback), &metadata);
@@ -259,10 +257,8 @@ std::tuple<std::string, int64_t> Video::getCurrentStream() const {
   return current_stream;
 }
 
-c10::Dict<
-    std::string,
-    c10::Dict<std::string, std::vector<double>>>
-Video::getStreamMetadata() const {
+c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
+    getStreamMetadata() const {
   return streamsMetadata;
 }
 
diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 76fd1e64828..453ba61f78a 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -23,17 +23,13 @@ using namespace ffmpeg;
 struct Video : torch::CustomClassHolder {
   std::tuple<std::string, long> current_stream; // stream type, id
   // global video metadata
-  c10::Dict<
-      std::string,
-      c10::Dict<std::string, std::vector<double>>>
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
       streamsMetadata;
 
  public:
   Video(std::string videoPath, std::string stream);
   std::tuple<std::string, int64_t> getCurrentStream() const;
-  c10::Dict<
-      std::string,
-      c10::Dict<std::string, std::vector<double>>>
+  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
   getStreamMetadata() const;
   void Seek(double ts);
   bool setCurrentStream(std::string stream);

From 703c12b4e6a6f1931b1455d7dc0b5b0fcb1cff48 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 05:56:51 -0500
Subject: [PATCH 089/128] Updated testing to test against pyav for the video
 tensor reads

---
 test/test_video.py | 201 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 173 insertions(+), 28 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index cd97b439c67..2cce97a4ed7 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -96,6 +96,125 @@
     ),
 }
 
+DecoderResult = collections.namedtuple(
+    "DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase"
+)
+
+def _read_from_stream(
+    container, start_pts, end_pts, stream, stream_name, buffer_size=4
+):
+    """
+    Args:
+        container: pyav container
+        start_pts/end_pts: the starting/ending Presentation TimeStamp where
+            frames are read
+        stream: pyav stream
+        stream_name: a dictionary of streams. For example, {"video": 0} means
+            video stream at stream index 0
+        buffer_size: pts of frames decoded by PyAv is not guaranteed to be in
+            ascending order. We need to decode more frames even when we meet end
+            pts
+    """
+    # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin
+    margin = 1
+    seek_offset = max(start_pts - margin, 0)
+
+    container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
+    frames = {}
+    buffer_count = 0
+    for frame in container.decode(**stream_name):
+        if frame.pts < start_pts:
+            continue
+        if frame.pts <= end_pts:
+            frames[frame.pts] = frame
+        else:
+            buffer_count += 1
+            if buffer_count >= buffer_size:
+                break
+    result = [frames[pts] for pts in sorted(frames)]
+
+    return result
+
+def _fraction_to_tensor(fraction):
+    ret = torch.zeros([2], dtype=torch.int32)
+    ret[0] = fraction.numerator
+    ret[1] = fraction.denominator
+    return ret
+
+def _decode_frames_by_av_module(
+    full_path,
+    video_start_pts=0,
+    video_end_pts=None,
+    audio_start_pts=0,
+    audio_end_pts=None,
+):
+    """
+    Use PyAv to decode video frames. This provides a reference for our decoder
+    to compare the decoding results.
+    Input arguments:
+        full_path: video file path
+        video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where
+            frames are read
+    """
+    import av
+    if video_end_pts is None:
+        video_end_pts = float("inf")
+    if audio_end_pts is None:
+        audio_end_pts = float("inf")
+    container = av.open(full_path)
+
+    video_frames = []
+    vtimebase = torch.zeros([0], dtype=torch.int32)
+    if container.streams.video:
+        video_frames = _read_from_stream(
+            container,
+            video_start_pts,
+            video_end_pts,
+            container.streams.video[0],
+            {"video": 0},
+        )
+        # container.streams.video[0].average_rate is not a reliable estimator of
+        # frame rate. It can be wrong for certain codec, such as VP80
+        # So we do not return video fps here
+        vtimebase = _fraction_to_tensor(container.streams.video[0].time_base)
+
+    audio_frames = []
+    atimebase = torch.zeros([0], dtype=torch.int32)
+    if container.streams.audio:
+        audio_frames = _read_from_stream(
+            container,
+            audio_start_pts,
+            audio_end_pts,
+            container.streams.audio[0],
+            {"audio": 0},
+        )
+        atimebase = _fraction_to_tensor(container.streams.audio[0].time_base)
+
+    container.close()
+    vframes = [frame.to_rgb().to_ndarray() for frame in video_frames]
+    vframes = torch.as_tensor(np.stack(vframes))
+
+    vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64)
+
+    aframes = [frame.to_ndarray() for frame in audio_frames]
+    if aframes:
+        aframes = np.transpose(np.concatenate(aframes, axis=1))
+        aframes = torch.as_tensor(aframes)
+    else:
+        aframes = torch.empty((1, 0), dtype=torch.float32)
+
+    aframe_pts = torch.tensor(
+        [audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64
+    )
+
+    return DecoderResult(
+        vframes=vframes.permute(0, 3, 1, 2),
+        vframe_pts=vframe_pts,
+        vtimebase=vtimebase,
+        aframes=aframes,
+        aframe_pts=aframe_pts,
+        atimebase=atimebase,
+    )
 
 def _template_read_video(video_object, s=0, e=None):
 
@@ -106,14 +225,15 @@ def _template_read_video(video_object, s=0, e=None):
             "end time should be larger than start time, got "
             "start time={} and end time={}".format(s, e)
         )
-
     video_object.set_current_stream("video")
     video_object.seek(s)
     video_frames = torch.empty(0)
     frames = []
+    video_pts = []
     t, pts = video_object.next()
     while t.numel() > 0 and (pts >= s and pts <= e):
         frames.append(t)
+        video_pts.append(pts)
         t, pts = video_object.next()
     if len(frames) > 0:
         video_frames = torch.stack(frames, 0)
@@ -122,13 +242,23 @@ def _template_read_video(video_object, s=0, e=None):
     video_object.seek(s)
     audio_frames = torch.empty(0)
     frames = []
+    audio_pts = []
     t, pts = video_object.next()
     while t.numel() > 0 and (pts > s and pts <= e):
         frames.append(t)
+        audio_pts.append(pts)
         t, pts = video_object.next()
     if len(frames) > 0:
         audio_frames = torch.stack(frames, 0)
 
+    return DecoderResult(
+        vframes=video_frames,
+        vframe_pts=video_pts,
+        vtimebase=None,
+        aframes=audio_frames,
+        aframe_pts=audio_pts,
+        atimebase=None,
+    )
     return video_frames, audio_frames, video_object.get_metadata()
 
 
@@ -137,9 +267,9 @@ class TestVideo(unittest.TestCase):
     def test_read_video_tensor(self):
         """
         Check if reading the video using the `next` based API yields the
-        same sized and equal tensors as video_reader.
+        same sized tensors as the pyav alternative.
         """
-        torchvision.set_video_backend("video_reader")
+        torchvision.set_video_backend("pyav")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using existing TV decoder
@@ -154,10 +284,10 @@ def test_read_video_tensor(self):
                 t, _ = reader.next()
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
-            self.assertEqual(torch.equal(tv_result, new_api), True)
 
     def test_pts(self):
-        """Check if the frames have the same timestamps
+        """
+        Check if every frame read from 
         """
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
@@ -176,9 +306,17 @@ def test_pts(self):
             napi_pts = [float(p) for p in pts]
             for i in range(len(napi_pts)):
                 self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
+         # check if pts of video frames are sorted in ascending order
+        for i in range(len(napi_pts) - 1):
+            self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
+
 
     def test_metadata(self):
-        torchvision.set_video_backend("video_reader")
+        """
+        Test that the metadata returned via pyav corresponds to the one returned
+        by the new video decoder API
+        """
+        torchvision.set_video_backend("pyav")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             reader = torch.classes.torchvision.Video(full_path, "video")
@@ -186,36 +324,43 @@ def test_metadata(self):
             self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
             self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
 
+    
     def test_video_reading_fn(self):
-        torchvision.set_video_backend("video_reader")
+        """
+        Test that the outputs of the pyav and ffmpeg outputs are mostly the same
+        """        
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
+            ref_result = _decode_frames_by_av_module(full_path)
+
             reader = torch.classes.torchvision.Video(full_path, "video")
-            video, audio, metadata = _template_read_video(reader)
-            tv_video, tv_audio, info = torchvision.io.read_video(full_path, pts_unit="sec")
+            newapi_result = _template_read_video(reader)
+            
+            
+            # First we check if the frames are approximately the same
+            # (note that every codec context has signature artefacts which
+            # make a direct comparison not feasible)
+            if newapi_result.vframes.numel() > 0 and ref_result.vframes.numel() > 0:
+                mean_delta = torch.mean(
+                torch.abs(newapi_result.vframes.float() - ref_result.vframes.float())
+            )
+            self.assertAlmostEqual(mean_delta, 0, delta=8.0)
 
-            self.assertEqual(torch.equal(tv_video.permute(0, 3, 1, 2), video), True)
-            self.assertEqual(torch.equal(tv_audio, audio), True)
+            # Just a sanity check: are the two of the correct size? 
+            self.assertEqual(newapi_result.vframes.size(), ref_result.vframes.size())
 
-    def test_partial_video_reading_fn(self):
-        import random
-        print("Test video reader")
-        torchvision.set_video_backend("video_reader")
-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            # select two random points between 0 and duration
-            r = []
-            r.append(random.uniform(0, config.duration))
-            r.append(random.uniform(0, config.duration))
-            s = min(r)
-            e = max(r)
 
-            reader = torch.classes.torchvision.Video(full_path, "video")
-            video, audio, metadata = _template_read_video(reader, s, e)
-            tv_video, tv_audio, info = torchvision.io.read_video(full_path, start_pts=s, end_pts=e, pts_unit="sec")
-            self.assertAlmostEqual(tv_video.size(0), video.size(0), delta=2.0)
+            # Lastly, we compare the resulting audio streams
+            if (
+                config.check_aframes
+                and newapi_result.aframes.numel() > 0
+                and ref_result.aframes.numel() > 0
+            ):
+                """Audio stream is available and audio frame is required to return
+                from decoder"""
+                is_same = torch.all(torch.eq(newapi_result.aframes, ref_result.aframes)).item()
+                self.assertEqual(is_same, True)
 
 
 if __name__ == '__main__':

From ca9cc251c46f1a6554a596883b9a25182bc5d24e Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 06:03:14 -0500
Subject: [PATCH 090/128] Formatting

---
 test/test_video.py                   | 18 +++++++++---------
 torchvision/csrc/cpu/video/Video.cpp | 14 ++++----------
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 2cce97a4ed7..dff2530a4b9 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -100,6 +100,7 @@
     "DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase"
 )
 
+
 def _read_from_stream(
     container, start_pts, end_pts, stream, stream_name, buffer_size=4
 ):
@@ -135,12 +136,14 @@ def _read_from_stream(
 
     return result
 
+
 def _fraction_to_tensor(fraction):
     ret = torch.zeros([2], dtype=torch.int32)
     ret[0] = fraction.numerator
     ret[1] = fraction.denominator
     return ret
 
+
 def _decode_frames_by_av_module(
     full_path,
     video_start_pts=0,
@@ -216,6 +219,7 @@ def _decode_frames_by_av_module(
         atimebase=atimebase,
     )
 
+
 def _template_read_video(video_object, s=0, e=None):
 
     if e is None:
@@ -310,7 +314,6 @@ def test_pts(self):
         for i in range(len(napi_pts) - 1):
             self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
 
-
     def test_metadata(self):
         """
         Test that the metadata returned via pyav corresponds to the one returned
@@ -324,11 +327,10 @@ def test_metadata(self):
             self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
             self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
 
-    
     def test_video_reading_fn(self):
         """
         Test that the outputs of the pyav and ffmpeg outputs are mostly the same
-        """        
+        """
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
@@ -336,21 +338,19 @@ def test_video_reading_fn(self):
 
             reader = torch.classes.torchvision.Video(full_path, "video")
             newapi_result = _template_read_video(reader)
-            
-            
+
             # First we check if the frames are approximately the same
             # (note that every codec context has signature artefacts which
             # make a direct comparison not feasible)
             if newapi_result.vframes.numel() > 0 and ref_result.vframes.numel() > 0:
                 mean_delta = torch.mean(
-                torch.abs(newapi_result.vframes.float() - ref_result.vframes.float())
-            )
+                    torch.abs(newapi_result.vframes.float() - ref_result.vframes.float())
+                )
             self.assertAlmostEqual(mean_delta, 0, delta=8.0)
 
-            # Just a sanity check: are the two of the correct size? 
+            # Just a sanity check: are the two of the correct size?
             self.assertEqual(newapi_result.vframes.size(), ref_result.vframes.size())
 
-
             # Lastly, we compare the resulting audio streams
             if (
                 config.check_aframes
diff --git a/torchvision/csrc/cpu/video/Video.cpp b/torchvision/csrc/cpu/video/Video.cpp
index f90c2c04871..f3c55fd6dea 100644
--- a/torchvision/csrc/cpu/video/Video.cpp
+++ b/torchvision/csrc/cpu/video/Video.cpp
@@ -135,6 +135,7 @@ void Video::_getDecoderParams(
     format.format.video.width = 0;
     format.format.video.height = 0;
     format.format.video.cropImage = 0;
+    format.format.video.format = defaultVideoPixelFormat;
     params.formats.insert(format);
 
     format.type = TYPE_SUBTITLE;
@@ -325,19 +326,12 @@ std::tuple<torch::Tensor, double> Video::Next() {
     // currently not supporting other formats (will do soon)
 
     out.payload.reset();
+  } else if (res == 61) {
+    LOG(INFO) << "Decoder ran out of frames (error 61)\n";
   } else {
-    LOG(ERROR) << "Decoder failed ( or ran into last iteration)";
+    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
   }
 
   std::tuple<torch::Tensor, double> result = {outFrame, frame_pts_s};
   return result;
 }
-
-// Video::~Video() {
-// destructor to be defined thoroughly later
-//   delete params; // does not have destructor
-//   delete metadata; // struct does not have destructor
-//   delete decoder; // should be fine
-//   delete streamFPS; // should be fine
-//   delete streamDuration; // should be fine
-// }

From 8991a5c7ec12d2b4eef2bad387ca9051bbd822cf Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 06:16:11 -0500
Subject: [PATCH 091/128] remove pyav from pip deps and add it to conda build

---
 .circleci/unittest/linux/scripts/environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 96b66319ed6..b9c9fb2be4b 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,4 +13,3 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
-    - av
\ No newline at end of file

From f8ce24a24d514475e58e49055acf4efe687104c6 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 06:16:29 -0500
Subject: [PATCH 092/128] add pyav and ffmpeg to conda builds

---
 .circleci/unittest/linux/scripts/setup_env.sh | 4 ++++
 .travis.yml                                   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 44ee98b91d0..d50a195c367 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -37,3 +37,7 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
+
+# install pyav and ffmpeg (Hacky)
+conda install -y -c conda-forge ffmpeg=4.2
+conda install av -c conda-forge -y
diff --git a/.travis.yml b/.travis.yml
index 44fd9ae3c16..23127f2b1cd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,6 +36,7 @@ before_install:
       pip install -q --user typing-extensions==3.6.6
       pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122
     fi
+  - conda install -c conda-forge ffmpeg=4.2
   - conda install av -c conda-forge
 
 

From dfa26968e7d5d4b7a5b1baa8e5267cc3e5346233 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 06:27:09 -0500
Subject: [PATCH 093/128] Formatting?

---
 .circleci/unittest/linux/scripts/environment.yml | 6 +++---
 .travis.yml                                      | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index b9c9fb2be4b..c201dd1ae9f 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -10,6 +10,6 @@ dependencies:
   - jpeg
   - ca-certificates
   - pip:
-    - future
-    - pillow>=4.1.1
-    - scipy
+      - future
+      - pillow>=4.1.1
+      - scipy
diff --git a/.travis.yml b/.travis.yml
index 23127f2b1cd..d7c4c4f0f28 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,7 +39,6 @@ before_install:
   - conda install -c conda-forge ffmpeg=4.2
   - conda install av -c conda-forge
 
-
 install:
   # Using pip instead of setup.py ensures we install a non-compressed version of the package
   # (as opposed to an egg), which is necessary to collect coverage.

From d85eab190e241e821c18c3401c9f516adda2b08a Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 06:42:33 -0500
Subject: [PATCH 094/128] Setting up linter once and for all hopefully

---
 test/test_video.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index dff2530a4b9..30e0b96ec3a 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -160,6 +160,7 @@ def _decode_frames_by_av_module(
             frames are read
     """
     import av
+
     if video_end_pts is None:
         video_end_pts = float("inf")
     if audio_end_pts is None:
@@ -291,13 +292,15 @@ def test_read_video_tensor(self):
 
     def test_pts(self):
         """
-        Check if every frame read from 
+        Check if every frame read from
         """
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
-            tv_timestamps, _ = torchvision.io.read_video_timestamps(full_path, pts_unit='sec')
+            tv_timestamps, _ = torchvision.io.read_video_timestamps(
+                full_path, pts_unit="sec"
+            )
             # pass 2: decode all frames using new api
             reader = torch.classes.torchvision.Video(full_path, "video")
             pts = []
@@ -310,7 +313,7 @@ def test_pts(self):
             napi_pts = [float(p) for p in pts]
             for i in range(len(napi_pts)):
                 self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
-         # check if pts of video frames are sorted in ascending order
+        # check if pts of video frames are sorted in ascending order
         for i in range(len(napi_pts) - 1):
             self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
 
@@ -324,8 +327,12 @@ def test_metadata(self):
             full_path = os.path.join(VIDEO_DIR, test_video)
             reader = torch.classes.torchvision.Video(full_path, "video")
             reader_md = reader.get_metadata()
-            self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
-            self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
+            self.assertAlmostEqual(
+                config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
+            )
+            self.assertAlmostEqual(
+                config.duration, reader_md["video"]["duration"][0], delta=0.5
+            )
 
     def test_video_reading_fn(self):
         """
@@ -344,7 +351,9 @@ def test_video_reading_fn(self):
             # make a direct comparison not feasible)
             if newapi_result.vframes.numel() > 0 and ref_result.vframes.numel() > 0:
                 mean_delta = torch.mean(
-                    torch.abs(newapi_result.vframes.float() - ref_result.vframes.float())
+                    torch.abs(
+                        newapi_result.vframes.float() - ref_result.vframes.float()
+                    )
                 )
             self.assertAlmostEqual(mean_delta, 0, delta=8.0)
 
@@ -359,9 +368,11 @@ def test_video_reading_fn(self):
             ):
                 """Audio stream is available and audio frame is required to return
                 from decoder"""
-                is_same = torch.all(torch.eq(newapi_result.aframes, ref_result.aframes)).item()
+                is_same = torch.all(
+                    torch.eq(newapi_result.aframes, ref_result.aframes)
+                ).item()
                 self.assertEqual(is_same, True)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()

From a33b5a4d1a8b11a646e32924ce81f8bfac249191 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:13:11 +0200
Subject: [PATCH 095/128] Testing pyav

---
 .circleci/unittest/linux/scripts/environment.yml | 1 -
 .circleci/unittest/linux/scripts/setup_env.sh    | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 96b66319ed6..b9c9fb2be4b 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,4 +13,3 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
-    - av
\ No newline at end of file
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 44ee98b91d0..8841d89ed34 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -37,3 +37,5 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
+
+conda install av -c conda-forge -y

From 1a6c9b2d322ec02b4add47d5fbc0cb67b3c5b22d Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:47:51 +0200
Subject: [PATCH 096/128] Fix to 8.0.0

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 8841d89ed34..d58554df84e 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -38,4 +38,5 @@ conda activate "${env_dir}"
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
-conda install av -c conda-forge -y
+conda install av=8.0.0 -c conda-forge -y
+conda list

From 6701c76081cfc5852ad80f34c1035591ceb1f202 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:59:40 +0200
Subject: [PATCH 097/128] Try 6.2.0

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index d58554df84e..05249652e49 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -38,5 +38,6 @@ conda activate "${env_dir}"
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
-conda install av=8.0.0 -c conda-forge -y
+conda list
+conda install av=6.2.0 -c conda-forge -y
 conda list

From e9feda262a3cbad9fc6ba93e7d1d0dff700384ad Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:19:07 +0200
Subject: [PATCH 098/128] See what happens with av from pip

---
 .circleci/unittest/linux/scripts/setup_env.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 05249652e49..8a14025d376 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -39,5 +39,5 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av=6.2.0 -c conda-forge -y
-conda list
+#conda install av=6.2.0 -c conda-forge -y
+#conda list

From b5fad6a3d867902c6a1e8e2157b8084bc426dec0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 11:27:41 -0500
Subject: [PATCH 099/128] Remove FFMPEG blocker

---
 .circleci/unittest/linux/scripts/setup_env.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index d50a195c367..44ee98b91d0 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -37,7 +37,3 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
-
-# install pyav and ffmpeg (Hacky)
-conda install -y -c conda-forge ffmpeg=4.2
-conda install av -c conda-forge -y

From f983d10f3986a8f77bcfd944d0e99e09eae75c3e Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:32:44 +0200
Subject: [PATCH 100/128] What is going on?

---
 .circleci/unittest/linux/scripts/environment.yml | 1 +
 .circleci/unittest/linux/scripts/setup_env.sh    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index b9c9fb2be4b..9b56b717f46 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,3 +13,4 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
+    - av
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 8a14025d376..90c7d6206b2 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -34,6 +34,7 @@ if [ ! -d "${env_dir}" ]; then
 fi
 conda activate "${env_dir}"
 
+conda list
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune

From e4f89743c01d284145d0afb62939a6d142f293da Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:49:26 +0200
Subject: [PATCH 101/128] More tests

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 90c7d6206b2..4043ee74440 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -41,4 +41,5 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
 #conda install av=6.2.0 -c conda-forge -y
-#conda list
+conda install ffmpeg=4.2.2 -c conda-forge -y
+conda list

From 77822d1683d94bff9d3271af9449aa8dea38a2de Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:49:51 +0200
Subject: [PATCH 102/128] Forgot something

---
 .circleci/unittest/linux/scripts/environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 9b56b717f46..b9c9fb2be4b 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,4 +13,3 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
-    - av

From e884adf08f260edd48f2dea15fc8b2377f5ed2e0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 2 Oct 2020 12:01:19 -0500
Subject: [PATCH 103/128] unblocker

---
 .circleci/unittest/linux/scripts/setup_env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 44ee98b91d0..8cdd176e579 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -37,3 +37,5 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
+
+conda install ffmpeg=4.2.2 -c conda-forge -y

From 8aaa9060a3e033e1033cde575d1251f0ab30c095 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 18:44:44 +0200
Subject: [PATCH 104/128] Check if cache is messing up with things

---
 .circleci/config.yml                          | 24 +++++++++----------
 .circleci/config.yml.in                       | 24 +++++++++----------
 .circleci/unittest/linux/scripts/setup_env.sh |  4 ++--
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0f44ccdde06..cf2471ba608 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-      - restore_cache:
-
-          keys:
-            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-
+          #- restore_cache:
+          #
+          #keys:
+          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-      - save_cache:
-
-          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-
-          paths:
-            - conda
-            - env
+          #- save_cache:
+          #
+          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #
+          #paths:
+          #  - conda
+          #  - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index fc0539bc682..ae574294f53 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-      - restore_cache:
-          {% raw %}
-          keys:
-            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          {% endraw %}
+          #- restore_cache:
+          #{% raw %}
+          #keys:
+          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #{% endraw %}
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-      - save_cache:
-          {% raw %}
-          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          {% endraw %}
-          paths:
-            - conda
-            - env
+          #- save_cache:
+          #{% raw %}
+          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #{% endraw %}
+          #paths:
+          #  - conda
+          #  - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 4043ee74440..bd15aa1bb82 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,6 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-#conda install av=6.2.0 -c conda-forge -y
-conda install ffmpeg=4.2.2 -c conda-forge -y
+conda install av -c conda-forge -y
+#conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From 7e2d0ead9dd864731fe8ac593852dff6ad06b9ff Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 18:57:20 +0200
Subject: [PATCH 105/128] Now try with different ffmpeg

---
 .circleci/unittest/linux/scripts/setup_env.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index bd15aa1bb82..f23de822933 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,6 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av -c conda-forge -y
+conda install av ffmpeg=4.2.2 -c conda-forge -y
 #conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From ffa47b966999bbd3bf1fa755d1af49ddec1794b5 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 19:11:30 +0200
Subject: [PATCH 106/128] Now try with different ffmpeg

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index f23de822933..244519fd6a5 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,7 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av ffmpeg=4.2.2 -c conda-forge -y
+#conda install av=6.2.0 -c conda-forge -y
+conda install av ffmpeg=4.0.2 -c conda-forge -y
 #conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From d47c58984246fa0661af6841c1c6f538036a424b Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:13:11 +0200
Subject: [PATCH 107/128] Testing pyav

---
 .circleci/unittest/linux/scripts/environment.yml | 1 -
 .circleci/unittest/linux/scripts/setup_env.sh    | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 96b66319ed6..b9c9fb2be4b 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,4 +13,3 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
-    - av
\ No newline at end of file
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 44ee98b91d0..8841d89ed34 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -37,3 +37,5 @@ conda activate "${env_dir}"
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
+
+conda install av -c conda-forge -y

From 5ee2fc14937e8f1f8315a258862a6efeefabda15 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:47:51 +0200
Subject: [PATCH 108/128] Fix to 8.0.0

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 8841d89ed34..d58554df84e 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -38,4 +38,5 @@ conda activate "${env_dir}"
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
-conda install av -c conda-forge -y
+conda install av=8.0.0 -c conda-forge -y
+conda list

From 1f6a3a3281ef512862ab708ebe4737d6868d15d1 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 17:59:40 +0200
Subject: [PATCH 109/128] Try 6.2.0

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index d58554df84e..05249652e49 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -38,5 +38,6 @@ conda activate "${env_dir}"
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
-conda install av=8.0.0 -c conda-forge -y
+conda list
+conda install av=6.2.0 -c conda-forge -y
 conda list

From ce2b071f29a907418d006b2ba6bafab717c2c74c Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:19:07 +0200
Subject: [PATCH 110/128] See what happens with av from pip

---
 .circleci/unittest/linux/scripts/setup_env.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 05249652e49..8a14025d376 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -39,5 +39,5 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av=6.2.0 -c conda-forge -y
-conda list
+#conda install av=6.2.0 -c conda-forge -y
+#conda list

From d17408f19f49286b49f7e901387d35f1a0fd5911 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:32:44 +0200
Subject: [PATCH 111/128] What is going on?

---
 .circleci/unittest/linux/scripts/environment.yml | 1 +
 .circleci/unittest/linux/scripts/setup_env.sh    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index b9c9fb2be4b..9b56b717f46 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,3 +13,4 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
+    - av
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 8a14025d376..90c7d6206b2 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -34,6 +34,7 @@ if [ ! -d "${env_dir}" ]; then
 fi
 conda activate "${env_dir}"
 
+conda list
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune

From b9f7dd6b22abf3bbc7755096ffd4c386ba9bb635 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:49:26 +0200
Subject: [PATCH 112/128] More tests

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 90c7d6206b2..4043ee74440 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -41,4 +41,5 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
 #conda install av=6.2.0 -c conda-forge -y
-#conda list
+conda install ffmpeg=4.2.2 -c conda-forge -y
+conda list

From df91c8d01b49011946e388b35c2dc466a90f6289 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Fri, 2 Oct 2020 18:49:51 +0200
Subject: [PATCH 113/128] Forgot something

---
 .circleci/unittest/linux/scripts/environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 9b56b717f46..b9c9fb2be4b 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -13,4 +13,3 @@ dependencies:
     - future
     - pillow>=4.1.1
     - scipy
-    - av

From 95d867796d518aad0b8b3798f8139c466d9c11fa Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 18:44:44 +0200
Subject: [PATCH 114/128] Check if cache is messing up with things

---
 .circleci/config.yml                          | 24 +++++++++----------
 .circleci/config.yml.in                       | 24 +++++++++----------
 .circleci/unittest/linux/scripts/setup_env.sh |  4 ++--
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a5e33528dfc..43647e442ac 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-      - restore_cache:
-
-          keys:
-            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-
+          #- restore_cache:
+          #
+          #keys:
+          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-      - save_cache:
-
-          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-
-          paths:
-            - conda
-            - env
+          #- save_cache:
+          #
+          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #
+          #paths:
+          #  - conda
+          #  - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index fc0539bc682..ae574294f53 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-      - restore_cache:
-          {% raw %}
-          keys:
-            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          {% endraw %}
+          #- restore_cache:
+          #{% raw %}
+          #keys:
+          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #{% endraw %}
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-      - save_cache:
-          {% raw %}
-          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          {% endraw %}
-          paths:
-            - conda
-            - env
+          #- save_cache:
+          #{% raw %}
+          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          #{% endraw %}
+          #paths:
+          #  - conda
+          #  - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 4043ee74440..bd15aa1bb82 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,6 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-#conda install av=6.2.0 -c conda-forge -y
-conda install ffmpeg=4.2.2 -c conda-forge -y
+conda install av -c conda-forge -y
+#conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From 8b2f6372d9ea8fc686a75f8869c7703ea060da85 Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 18:57:20 +0200
Subject: [PATCH 115/128] Now try with different ffmpeg

---
 .circleci/unittest/linux/scripts/setup_env.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index bd15aa1bb82..f23de822933 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,6 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av -c conda-forge -y
+conda install av ffmpeg=4.2.2 -c conda-forge -y
 #conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From 37122bcada0fdb9ea3e91ad560f8928a907b1b7b Mon Sep 17 00:00:00 2001
From: Francisco Massa <fvsmassa@gmail.com>
Date: Sat, 3 Oct 2020 19:11:30 +0200
Subject: [PATCH 116/128] Now try with different ffmpeg

---
 .circleci/unittest/linux/scripts/setup_env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index f23de822933..244519fd6a5 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -40,6 +40,7 @@ printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
-conda install av ffmpeg=4.2.2 -c conda-forge -y
+#conda install av=6.2.0 -c conda-forge -y
+conda install av ffmpeg=4.0.2 -c conda-forge -y
 #conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From 5885f28bd2c8092e5f31d7573baa5dab4e902dd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edgar=20Andr=C3=A9s=20Margffoy=20Tuay?= <andfoy@gmail.com>
Date: Mon, 5 Oct 2020 17:14:21 -0500
Subject: [PATCH 117/128] Do not install av

---
 .circleci/unittest/linux/scripts/setup_env.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 244519fd6a5..750d53a0476 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -41,6 +41,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 conda list
 #conda install av=6.2.0 -c conda-forge -y
-conda install av ffmpeg=4.0.2 -c conda-forge -y
-#conda install ffmpeg=4.2.2 -c conda-forge -y
+# conda install av ffmpeg=4.0.2 -c conda-forge -y
+conda install ffmpeg=4.2.2 -c conda-forge -y
 conda list

From 10095e1a30ecb7e27dec4cfb973711e830113d0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edgar=20Andr=C3=A9s=20Margffoy=20Tuay?= <andfoy@gmail.com>
Date: Mon, 5 Oct 2020 17:20:42 -0500
Subject: [PATCH 118/128] Test with ffmpeg 4.2

---
 .circleci/unittest/linux/scripts/setup_env.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 750d53a0476..558057bc2f3 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -42,5 +42,5 @@ conda env update --file "${this_dir}/environment.yml" --prune
 conda list
 #conda install av=6.2.0 -c conda-forge -y
 # conda install av ffmpeg=4.0.2 -c conda-forge -y
-conda install ffmpeg=4.2.2 -c conda-forge -y
+conda install ffmpeg=4.2 -c conda-forge -y
 conda list

From 825d2ec95eb27303dc3b4948938194cf55efdaa0 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 11:58:16 -0500
Subject: [PATCH 119/128] clean up video tests

---
 test/test_video.py | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 30e0b96ec3a..569c5ab4d7f 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -11,6 +11,14 @@
 import torchvision
 from torchvision.io import _HAS_VIDEO_OPT
 
+try:
+    import av
+
+    # Do a version test too
+    io.video._check_av_available()
+except ImportError:
+    av = None
+
 
 VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
 
@@ -159,8 +167,6 @@ def _decode_frames_by_av_module(
         video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where
             frames are read
     """
-    import av
-
     if video_end_pts is None:
         video_end_pts = float("inf")
     if audio_end_pts is None:
@@ -269,6 +275,7 @@ def _template_read_video(video_object, s=0, e=None):
 
 @unittest.skipIf(_HAS_VIDEO_OPT is False, "Didn't compile with ffmpeg")
 class TestVideo(unittest.TestCase):
+    @unittest.skipIf(av is None, "PyAV unavailable")
     def test_read_video_tensor(self):
         """
         Check if reading the video using the `next` based API yields the
@@ -290,6 +297,29 @@ def test_read_video_tensor(self):
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
 
+    def test_partial_video_reading_fn(self):
+        import random
+
+        torchvision.set_video_backend("video_reader")
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            # select two random points between 0 and duration
+            r = []
+            r.append(random.uniform(0, config.duration))
+            r.append(random.uniform(0, config.duration))
+            s = min(r)
+            e = max(r)
+
+            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            results = _template_read_video(reader, s, e)
+            tv_video, tv_audio, info = torchvision.io.read_video(
+                full_path, start_pts=s, end_pts=e, pts_unit="sec"
+            )
+            self.assertAlmostEqual(
+                tv_video.size(0), results.svframes.ssize(0), delta=2.0
+            )
+
     def test_pts(self):
         """
         Check if every frame read from
@@ -317,6 +347,7 @@ def test_pts(self):
         for i in range(len(napi_pts) - 1):
             self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
 
+    @unittest.skipIf(av is None, "PyAV unavailable")
     def test_metadata(self):
         """
         Test that the metadata returned via pyav corresponds to the one returned
@@ -373,6 +404,10 @@ def test_video_reading_fn(self):
                 ).item()
                 self.assertEqual(is_same, True)
 
+    def test_partial_reads(self):
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
 
 if __name__ == "__main__":
     unittest.main()

From fd510c8a1f2a6690cd7ece6e0d96506d3376a47f Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 12:02:47 -0500
Subject: [PATCH 120/128] cleaning up the tests a bit to better test partial
 reading

---
 test/test_video.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 569c5ab4d7f..ab6ec74cf10 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -15,7 +15,7 @@
     import av
 
     # Do a version test too
-    io.video._check_av_available()
+    torchvision.io.video._check_av_available()
 except ImportError:
     av = None
 
@@ -311,14 +311,12 @@ def test_partial_video_reading_fn(self):
             s = min(r)
             e = max(r)
 
-            reader = torch.classes.torchvision.Video(full_path, "video", True)
+            reader = torch.classes.torchvision.Video(full_path, "video")
             results = _template_read_video(reader, s, e)
             tv_video, tv_audio, info = torchvision.io.read_video(
                 full_path, start_pts=s, end_pts=e, pts_unit="sec"
             )
-            self.assertAlmostEqual(
-                tv_video.size(0), results.svframes.ssize(0), delta=2.0
-            )
+            self.assertAlmostEqual(tv_video.size(0), results.vframes.size(0), delta=2.0)
 
     def test_pts(self):
         """

From c0aeb54b72aec104a35453cd838cc3270b79b049 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 12:06:43 -0500
Subject: [PATCH 121/128] arrgh linter

---
 test/test_video.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index ab6ec74cf10..ea47a9e4ad6 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -402,10 +402,6 @@ def test_video_reading_fn(self):
                 ).item()
                 self.assertEqual(is_same, True)
 
-    def test_partial_reads(self):
-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
 
 if __name__ == "__main__":
     unittest.main()

From df6a6128c8a2fb075efaa106921c4264c3b29196 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 12:55:03 -0500
Subject: [PATCH 122/128] Forgot the av test

---
 test/test_video.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_video.py b/test/test_video.py
index ea47a9e4ad6..94a2b55ff8d 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -363,6 +363,7 @@ def test_metadata(self):
                 config.duration, reader_md["video"]["duration"][0], delta=0.5
             )
 
+    @unittest.skipIf(av is None, "PyAV unavailable")
     def test_video_reading_fn(self):
         """
         Test that the outputs of the pyav and ffmpeg outputs are mostly the same

From d91021492d5c7df9d98b9c6e6dbfc736fdae4c68 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 12:59:53 -0500
Subject: [PATCH 123/128] forgot av test

---
 test/test_video.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 94a2b55ff8d..57aeecf6573 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -3,6 +3,7 @@
 import contextlib
 import tempfile
 import unittest
+import random
 
 
 import numpy as np
@@ -298,8 +299,6 @@ def test_read_video_tensor(self):
             self.assertEqual(tv_result.size(), new_api.size())
 
     def test_partial_video_reading_fn(self):
-        import random
-
         torchvision.set_video_backend("video_reader")
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)

From bdd3b08853fc1d6a76924ab27a9f3767d0233f6f Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 13:50:39 -0500
Subject: [PATCH 124/128] checkout build files from master

---
 .circleci/config.yml                          | 24 +++++++++----------
 .../unittest/linux/scripts/environment.yml    |  7 +++---
 .circleci/unittest/linux/scripts/setup_env.sh |  6 -----
 .travis.yml                                   |  2 +-
 4 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 43647e442ac..a5e33528dfc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-          #- restore_cache:
-          #
-          #keys:
-          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          #
+      - restore_cache:
+
+          keys:
+            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-          #- save_cache:
-          #
-          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          #
-          #paths:
-          #  - conda
-          #  - env
+      - save_cache:
+
+          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+
+          paths:
+            - conda
+            - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh
diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index 08f78d79a89..91275ff31bd 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -12,6 +12,7 @@ dependencies:
   - ffmpeg=4.2
   - ca-certificates
   - pip:
-      - future
-      - pillow>=4.1.1
-      - scipy
+    - future
+    - pillow>=4.1.1
+    - scipy
+    - av
\ No newline at end of file
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index d572d6d0ebc..44ee98b91d0 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -34,12 +34,6 @@ if [ ! -d "${env_dir}" ]; then
 fi
 conda activate "${env_dir}"
 
-conda list
 # 3. Install Conda dependencies
 printf "* Installing dependencies (except PyTorch)\n"
 conda env update --file "${this_dir}/environment.yml" --prune
-
-#conda install av=6.2.0 -c conda-forge -y
-# conda install av ffmpeg=4.0.2 -c conda-forge -y
-conda install ffmpeg=4.2 -c conda-forge -y
-conda list
diff --git a/.travis.yml b/.travis.yml
index d7c4c4f0f28..44fd9ae3c16 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,9 +36,9 @@ before_install:
       pip install -q --user typing-extensions==3.6.6
       pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122
     fi
-  - conda install -c conda-forge ffmpeg=4.2
   - conda install av -c conda-forge
 
+
 install:
   # Using pip instead of setup.py ensures we install a non-compressed version of the package
   # (as opposed to an egg), which is necessary to collect coverage.

From 36fcaf2e99dfee6f4b40b988fc1e071cc5fab11d Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 14:03:25 -0500
Subject: [PATCH 125/128] revert circleci

---
 .circleci/config.yml.in | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index ae574294f53..fc0539bc682 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -408,21 +408,21 @@ jobs:
           name: Generate cache key
           # This will refresh cache on Sundays, nightly build should generate new cache.
           command: echo "$(date +"%Y-%U")" > .circleci-weekly
-          #- restore_cache:
-          #{% raw %}
-          #keys:
-          #  - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          #{% endraw %}
+      - restore_cache:
+          {% raw %}
+          keys:
+            - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
       - run:
           name: Setup
           command: .circleci/unittest/linux/scripts/setup_env.sh
-          #- save_cache:
-          #{% raw %}
-          #key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
-          #{% endraw %}
-          #paths:
-          #  - conda
-          #  - env
+      - save_cache:
+          {% raw %}
+          key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
+          paths:
+            - conda
+            - env
       - run:
           name: Install torchvision
           command: .circleci/unittest/linux/scripts/install.sh

From 03bd6f438e2e3dcde08551a665d134d7a1d94958 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 14:22:58 -0500
Subject: [PATCH 126/128] addressing Franciscos comments

---
 torchvision/csrc/cpu/video/Video.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/torchvision/csrc/cpu/video/Video.h b/torchvision/csrc/cpu/video/Video.h
index 453ba61f78a..8060adfcfce 100644
--- a/torchvision/csrc/cpu/video/Video.h
+++ b/torchvision/csrc/cpu/video/Video.h
@@ -1,8 +1,5 @@
 #pragma once
 
-#ifndef VIDEO_H_
-#define VIDEO_H_
-
 #include <map>
 #include <regex>
 #include <string>
@@ -62,5 +59,3 @@ struct Video : torch::CustomClassHolder {
   DecoderParameters params;
 
 }; // struct Video
-
-#endif // VIDEO_H_

From e4e3765cbca368c8a09c39aa80bf115f0c84c1cf Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Tue, 6 Oct 2020 14:29:08 -0500
Subject: [PATCH 127/128] addressing Franciscos comments

---
 test/test_video.py | 90 +++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/test/test_video.py b/test/test_video.py
index 57aeecf6573..63434fa9c1f 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -298,51 +298,51 @@ def test_read_video_tensor(self):
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
 
-    def test_partial_video_reading_fn(self):
-        torchvision.set_video_backend("video_reader")
-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            # select two random points between 0 and duration
-            r = []
-            r.append(random.uniform(0, config.duration))
-            r.append(random.uniform(0, config.duration))
-            s = min(r)
-            e = max(r)
-
-            reader = torch.classes.torchvision.Video(full_path, "video")
-            results = _template_read_video(reader, s, e)
-            tv_video, tv_audio, info = torchvision.io.read_video(
-                full_path, start_pts=s, end_pts=e, pts_unit="sec"
-            )
-            self.assertAlmostEqual(tv_video.size(0), results.vframes.size(0), delta=2.0)
-
-    def test_pts(self):
-        """
-        Check if every frame read from
-        """
-        torchvision.set_video_backend("video_reader")
-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_timestamps, _ = torchvision.io.read_video_timestamps(
-                full_path, pts_unit="sec"
-            )
-            # pass 2: decode all frames using new api
-            reader = torch.classes.torchvision.Video(full_path, "video")
-            pts = []
-            t, p = reader.next()
-            while t.numel() > 0:
-                pts.append(p)
-                t, p = reader.next()
-
-            tv_timestamps = [float(p) for p in tv_timestamps]
-            napi_pts = [float(p) for p in pts]
-            for i in range(len(napi_pts)):
-                self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
-        # check if pts of video frames are sorted in ascending order
-        for i in range(len(napi_pts) - 1):
-            self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
+    # def test_partial_video_reading_fn(self):
+    #     torchvision.set_video_backend("video_reader")
+    #     for test_video, config in test_videos.items():
+    #         full_path = os.path.join(VIDEO_DIR, test_video)
+
+    #         # select two random points between 0 and duration
+    #         r = []
+    #         r.append(random.uniform(0, config.duration))
+    #         r.append(random.uniform(0, config.duration))
+    #         s = min(r)
+    #         e = max(r)
+
+    #         reader = torch.classes.torchvision.Video(full_path, "video")
+    #         results = _template_read_video(reader, s, e)
+    #         tv_video, tv_audio, info = torchvision.io.read_video(
+    #             full_path, start_pts=s, end_pts=e, pts_unit="sec"
+    #         )
+    #         self.assertAlmostEqual(tv_video.size(0), results.vframes.size(0), delta=2.0)
+
+    # def test_pts(self):
+    #     """
+    #     Check if every frame read from
+    #     """
+    #     torchvision.set_video_backend("video_reader")
+    #     for test_video, config in test_videos.items():
+    #         full_path = os.path.join(VIDEO_DIR, test_video)
+
+    #         tv_timestamps, _ = torchvision.io.read_video_timestamps(
+    #             full_path, pts_unit="sec"
+    #         )
+    #         # pass 2: decode all frames using new api
+    #         reader = torch.classes.torchvision.Video(full_path, "video")
+    #         pts = []
+    #         t, p = reader.next()
+    #         while t.numel() > 0:
+    #             pts.append(p)
+    #             t, p = reader.next()
+
+    #         tv_timestamps = [float(p) for p in tv_timestamps]
+    #         napi_pts = [float(p) for p in pts]
+    #         for i in range(len(napi_pts)):
+    #             self.assertAlmostEqual(napi_pts[i], tv_timestamps[i], delta=0.001)
+    #     # check if pts of video frames are sorted in ascending order
+    #     for i in range(len(napi_pts) - 1):
+    #         self.assertEqual(napi_pts[i] < napi_pts[i + 1], True)
 
     @unittest.skipIf(av is None, "PyAV unavailable")
     def test_metadata(self):

From aae1d4f61c8e9dcc4ba278b28a47aa066637f2b5 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Wed, 7 Oct 2020 04:53:17 -0500
Subject: [PATCH 128/128] Ignore ffmpeg in travis

---
 .travis.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 44fd9ae3c16..789c355765f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,7 +38,6 @@ before_install:
     fi
   - conda install av -c conda-forge
 
-
 install:
   # Using pip instead of setup.py ensures we install a non-compressed version of the package
   # (as opposed to an egg), which is necessary to collect coverage.
@@ -55,7 +54,7 @@ install:
     cd -
 
 script:
-  - pytest --cov-config .coveragerc --cov torchvision --cov $TV_INSTALL_PATH -k 'not TestVideoReader and not TestVideoTransforms and not TestIO' test --ignore=test/test_datasets_download.py
+  - pytest --cov-config .coveragerc --cov torchvision --cov $TV_INSTALL_PATH -k 'not TestVideo and not TestVideoReader and not TestVideoTransforms and not TestIO' test --ignore=test/test_datasets_download.py
   - pytest test/test_hub.py
 
 after_success: