Also unify variable format index for audio

vapoursynth · Nov 5, 2024 · 2817e14 · 2817e14
1 parent 4f6ef06
commit 2817e14
Show file tree

Hide file tree

Showing 7 changed files with 199 additions and 79 deletions.
diff --git a/src/audiosource.cpp b/src/audiosource.cpp
diff --git a/src/audiosource.h b/src/audiosource.h
@@ -45,13 +45,24 @@ struct BSAudioFormat {
     void Set(int Format, int BitsPerRawSample);
 };
 
-struct BSAudioProperties {
+// int format, uint64_t ChannelLayout, int samplerate
+
+struct LWAudioProperties {
+    BSRational TimeBase;
+    int64_t Duration;
+
+    int64_t NumSamples; /* estimated by decoder, may be wrong */
+};
+
+struct BSAudioProperties : public LWAudioProperties {
     BSAudioFormat AF;
+    int Format;
     int SampleRate;
     int Channels;
     uint64_t ChannelLayout;
-    int64_t NumFrames; // can be -1 to signal that the number of frames is completely unknown
-    int64_t NumSamples; /* estimated by decoder, may be wrong */
+
+    int64_t NumFrames;
+
     double StartTime; /* in seconds */
 };
 
@@ -67,20 +78,20 @@ class LWAudioDecoder {
     AVPacket *Packet = nullptr;
     bool Seeked = false;
 
-    void OpenFile(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale);
+    void OpenFile(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale);
     bool ReadPacket();
     bool DecodeNextFrame(bool SkipOutput = false);
     void Free();
 public:
-    LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale); // Positive track numbers are absolute. Negative track numbers mean nth audio track to simplify things.
+    LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale); // Positive track numbers are absolute. Negative track numbers mean nth audio track to simplify things.
     ~LWAudioDecoder();
     [[nodiscard]] int64_t GetSourceSize() const;
     [[nodiscard]] int64_t GetSourcePostion() const;
     [[nodiscard]] int GetTrack() const; // Useful when opening nth video track to get the actual number
     [[nodiscard]] int64_t GetFrameNumber() const; // The frame you will get when calling GetNextFrame()
     [[nodiscard]] int64_t GetSamplePos() const; // The frame you will get when calling GetNextFrame()
     void SetFrameNumber(int64_t N, int64_t SampleNumber); // Use after seeking to update internal frame number
-    void GetAudioProperties(BSAudioProperties &VP); // Decodes one frame and advances the position to retrieve the full properties, only call directly after creation
+    void GetAudioProperties(LWAudioProperties &VP); // Decodes one frame and advances the position to retrieve the full properties, only call directly after creation
     [[nodiscard]] AVFrame *GetNextFrame();
     bool SkipFrames(int64_t Count);
     [[nodiscard]] bool HasMoreFrames() const;
@@ -105,10 +116,27 @@ class BestAudioFrame {
 
 class BestAudioSource {
 public:
+    struct FormatSet {
+        BSAudioFormat AF = {};
+        int Format;
+        int SampleRate;
+        int Channels;
+        uint64_t ChannelLayout;
+
+        double StartTime = 0;
+
+        int64_t NumFrames; // can be -1 to signal that the number of frames is completely unknown
+        int64_t NumSamples;
+    };
+
     struct FrameInfo {
         int64_t PTS;
         int64_t Start;
         int64_t Length;
+        int Format;
+        int SampleRate;
+        int Channels;
+        uint64_t ChannelLayout;
         std::array<uint8_t, HashSize> Hash;
     };
 private:
@@ -144,13 +172,16 @@ class BestAudioSource {
     AudioTrackIndex TrackIndex;
     Cache FrameCache;
 
+    std::vector<FormatSet> FormatSets;
+    FormatSet DefaultFormatSet;
+
     static constexpr int MaxVideoSources = 4;
     std::map<std::string, std::string> LAVFOptions;
     double DrcScale;
     BSAudioProperties AP = {};
     std::filesystem::path Source;
     int AudioTrack;
-    bool VariableFormat;
+    int VariableFormat = -1;
     int Threads;
     bool LinearMode = false;
     uint64_t DecoderSequenceNum = 0;
@@ -167,6 +198,7 @@ class BestAudioSource {
     [[nodiscard]] BestAudioFrame *GetFrameInternal(int64_t N);
     [[nodiscard]] BestAudioFrame *GetFrameLinearInternal(int64_t N, int64_t SeekFrame = -1, size_t Depth = 0, bool ForceUnseeked = false);
     [[nodiscard]] bool IndexTrack(const ProgressFunction &Progress = nullptr);
+    void InitializeFormatSets();
     void ZeroFillStartPacked(uint8_t *&Data, int64_t &Start, int64_t &Count);
     void ZeroFillEndPacked(uint8_t *Data, int64_t Start, int64_t &Count);
     bool FillInFramePacked(const BestAudioFrame *Frame, int64_t FrameStartSample, uint8_t *&Data, int64_t &Start, int64_t &Count);
@@ -180,12 +212,14 @@ class BestAudioSource {
         int64_t FirstSamplePos;
     };
 
-    BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, bool VariableFormat, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress = nullptr);
+    BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress = nullptr);
     [[nodiscard]] int GetTrack() const; // Useful when opening nth video track to get the actual number
     void SetMaxCacheSize(size_t Bytes); /* default max size is 1GB */
     void SetSeekPreRoll(int64_t Frames); /* the number of frames to cache before the position being fast forwarded to */
     double GetRelativeStartTime(int Track) const;
     [[nodiscard]] const BSAudioProperties &GetAudioProperties() const;
+    [[nodiscard]] const std::vector<FormatSet> &GetFormatSets() const; /* Get a listing of all available format sets */
+    void SelectFormatSet(int Index); /* Sets the output format to the specified format set; passing -1 means the default variable format will be used */
     [[nodiscard]] BestAudioFrame *GetFrame(int64_t N, bool Linear = false);
     [[nodiscard]] FrameRange GetFrameRangeBySamples(int64_t Start, int64_t Count) const;
     void GetPackedAudio(uint8_t *Data, int64_t Start, int64_t Count);

diff --git a/src/avisynth.cpp b/src/avisynth.cpp
@@ -304,7 +304,9 @@ class AvisynthAudioSource : public IClip {
             Opts["use_absolute_path"] = "1";
 
         try {
-            A.reset(new BestAudioSource(CreateProbablyUTF8Path(Source), Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+            A.reset(new BestAudioSource(CreateProbablyUTF8Path(Source), Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+
+            A->SelectFormatSet(0);
 
             const BSAudioProperties &AP = A->GetAudioProperties();
             if (AP.AF.Float && AP.AF.Bits == 32) {

diff --git a/src/vapoursynth.cpp b/src/vapoursynth.cpp
@@ -319,7 +319,7 @@ static void VS_CC CreateBestAudioSource(const VSMap *In, VSMap *Out, void *, VSC
         if (ShowProgress) {
             auto NextUpdate = std::chrono::high_resolution_clock::now();
             int LastValue = -1;
-            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale,
+            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale,
                 [vsapi, Core, &NextUpdate, &LastValue](int Track, int64_t Cur, int64_t Total) {
                     if (NextUpdate < std::chrono::high_resolution_clock::now()) {
                         if (Total == INT64_MAX && Cur == Total) {
@@ -337,9 +337,11 @@ static void VS_CC CreateBestAudioSource(const VSMap *In, VSMap *Out, void *, VSC
                 }));
 
         } else {
-            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
         }
 
+        D->A->SelectFormatSet(0);
+
         const BSAudioProperties &AP = D->A->GetAudioProperties();
         D->Is8Bit = (AP.AF.Bits <= 8);
         if (!vsapi->queryAudioFormat(&D->AI.format, AP.AF.Float, D->Is8Bit ? 16 : AP.AF.Bits, AP.ChannelLayout, Core))

diff --git a/src/version.h b/src/version.h
@@ -21,7 +21,7 @@
 #ifndef VERSION_H
 #define VERSION_H
 
-#define BEST_SOURCE_VERSION_MAJOR 6
+#define BEST_SOURCE_VERSION_MAJOR 9
 #define BEST_SOURCE_VERSION_MINOR 0
 
 #endif
diff --git a/src/videosource.cpp b/src/videosource.cpp
@@ -1398,10 +1398,10 @@ bool BestVideoSource::InitializeRFF() {
 }
 
 void BestVideoSource::InitializeFormatSets() {
-    std::map<std::tuple<int, int, int>, std::tuple<int64_t, int64_t, int64_t, const FrameInfo *>> SeenSets;
+    std::map<std::tuple<int, int, int>, std::tuple<int64_t, int64_t, int64_t, bool>> SeenSets;
     for (const auto &Iter : TrackIndex.Frames) {
         auto V = std::make_tuple(Iter.Format, Iter.Width, Iter.Height);
-        if (SeenSets.insert(std::make_pair(V, std::make_tuple(0, 0, 0, &Iter))).second)
+        if (SeenSets.insert(std::make_pair(V, std::make_tuple(0, 0, Iter.PTS, Iter.TFF))).second)
             FormatSets.push_back(FormatSet{ {}, Iter.Format, Iter.Width, Iter.Height });
         std::get<0>(SeenSets[V])++;
         std::get<1>(SeenSets[V]) += Iter.RepeatPict + 2;
@@ -1410,24 +1410,31 @@ void BestVideoSource::InitializeFormatSets() {
     for (auto &Iter : FormatSets) {
         auto V = std::make_tuple(Iter.Format, Iter.Width, Iter.Height);
         Iter.NumFrames = std::get<0>(SeenSets[V]);
-        Iter.NumRFFFrames = (std::get<1>(SeenSets[V]) + 1) / 2;
-        Iter.TFF = std::get<3>(SeenSets[V])->TFF;
-        if (std::get<3>(SeenSets[V])->PTS != AV_NOPTS_VALUE)
-            Iter.StartTime = (static_cast<double>(VP.TimeBase.Num) * std::get<3>(SeenSets[V])->PTS) / VP.TimeBase.Den;      
+        Iter.NumRFFFrames = std::get<1>(SeenSets[V]);
+        Iter.TFF = std::get<3>(SeenSets[V]);
+        if (std::get<2>(SeenSets[V]) != AV_NOPTS_VALUE)
+            Iter.StartTime = (static_cast<double>(VP.TimeBase.Num) * std::get<2>(SeenSets[V])) / VP.TimeBase.Den;      
         Iter.VF.Set(av_pix_fmt_desc_get(static_cast<AVPixelFormat>(Iter.Format)));
     }
 
     DefaultFormatSet = FormatSets[0];
     DefaultFormatSet.NumFrames = TrackIndex.Frames.size();
     DefaultFormatSet.NumRFFFrames = 0;
-    for (const auto &Iter : FormatSets) {
+
+    for (auto &Iter : FormatSets) {
+        DefaultFormatSet.NumRFFFrames += Iter.NumRFFFrames;
+        Iter.NumRFFFrames = (Iter.NumRFFFrames + 1) / 2; // Round only after the unrounded value has been added to the default set's total
+
         if (DefaultFormatSet.Format != Iter.Format)
             DefaultFormatSet.Format = AV_PIX_FMT_NONE;
         if (DefaultFormatSet.Width != Iter.Width || DefaultFormatSet.Height != Iter.Height) {
             DefaultFormatSet.Width = 0;
             DefaultFormatSet.Height = 0;
         }
     }
+
+    DefaultFormatSet.NumRFFFrames = (DefaultFormatSet.NumRFFFrames + 1) / 2;
+
     if (DefaultFormatSet.Format != AV_PIX_FMT_NONE)
         DefaultFormatSet.VF.Set(av_pix_fmt_desc_get(static_cast<AVPixelFormat>(DefaultFormatSet.Format)));
     else

diff --git a/src/videosource.h b/src/videosource.h
@@ -84,7 +84,7 @@ struct LWVideoProperties {
 
 struct BSVideoProperties : public LWVideoProperties {
     BSVideoFormat VF;
-    int Format; // fixme, needed?
+    int Format;
 
     int Width;
     int Height;