From 2817e14e91cbea36566b6f7284acb7701062a07e Mon Sep 17 00:00:00 2001
From: Fredrik Mellbin <fredrik.mellbin@gmail.com>
Date: Tue, 5 Nov 2024 19:45:24 +0100
Subject: [PATCH] Also unify variable format index for audio

---
 src/audiosource.cpp | 193 ++++++++++++++++++++++++++++++--------------
 src/audiosource.h   |  50 ++++++++++--
 src/avisynth.cpp    |   4 +-
 src/vapoursynth.cpp |   6 +-
 src/version.h       |   2 +-
 src/videosource.cpp |  21 +++--
 src/videosource.h   |   2 +-
 7 files changed, 199 insertions(+), 79 deletions(-)
diff --git a/src/audiosource.cpp b/src/audiosource.cpp
index b81b201..1349a9f 100644
--- a/src/audiosource.cpp
+++ b/src/audiosource.cpp
@@ -70,7 +70,7 @@ bool LWAudioDecoder::DecodeNextFrame(bool SkipOutput) {
     return false;
 }
 
-void LWAudioDecoder::OpenFile(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale) {
+void LWAudioDecoder::OpenFile(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale) {
     TrackNumber = Track;
 
     AVDictionary *Dict = nullptr;
@@ -134,12 +134,6 @@ void LWAudioDecoder::OpenFile(const std::filesystem::path &SourceFile, int Track
     }
     CodecContext->thread_count = Threads;
 
-    // FIXME, implement for newer ffmpeg versions
-    if (!VariableFormat) {
-        // Probably guard against mid-stream format changes
-        CodecContext->flags |= AV_CODEC_FLAG_DROPCHANGED;
-    }
-
     if (DrcScale < 0)
         throw BestSourceException("Invalid drc_scale value");
 
@@ -151,10 +145,10 @@ void LWAudioDecoder::OpenFile(const std::filesystem::path &SourceFile, int Track
         throw BestSourceException("Could not open audio codec");
 }
 
-LWAudioDecoder::LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale) {
+LWAudioDecoder::LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale) {
     try {
         Packet = av_packet_alloc();
-        OpenFile(SourceFile, Track, VariableFormat, Threads, LAVFOpts, DrcScale);
+        OpenFile(SourceFile, Track, Threads, LAVFOpts, DrcScale);
     } catch (...) {
         Free();
         throw;
@@ -197,37 +191,12 @@ void LWAudioDecoder::SetFrameNumber(int64_t N, int64_t SampleNumber) {
     CurrentSample = SampleNumber;
 }
 
-void LWAudioDecoder::GetAudioProperties(BSAudioProperties &AP) {
-    assert(CurrentFrame == 0);
+void LWAudioDecoder::GetAudioProperties(LWAudioProperties &AP) {
     AP = {};
-    AVFrame *PropFrame = GetNextFrame();
-    assert(PropFrame);
-    if (!PropFrame)
-        return;
 
-    AP.AF.Set(PropFrame->format, CodecContext->bits_per_raw_sample);
-    AP.SampleRate = PropFrame->sample_rate;
-    AP.Channels = PropFrame->ch_layout.nb_channels;
-
-    if (PropFrame->ch_layout.order == AV_CHANNEL_ORDER_NATIVE) {
-        AP.ChannelLayout = PropFrame->ch_layout.u.mask;
-    } else if (PropFrame->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) {
-        AVChannelLayout ch = {};
-        av_channel_layout_default(&ch, PropFrame->ch_layout.nb_channels);
-        AP.ChannelLayout = ch.u.mask;
-    } else {
-        av_frame_free(&PropFrame);
-        throw BestSourceException("Ambisonics and custom channel orders not supported");
-    }
-
-    AP.NumSamples = (FormatContext->duration * PropFrame->sample_rate) / AV_TIME_BASE - FormatContext->streams[TrackNumber]->codecpar->initial_padding;
-    if (PropFrame->pts != AV_NOPTS_VALUE)
-        AP.StartTime = (static_cast<double>(FormatContext->streams[TrackNumber]->time_base.num) * PropFrame->pts) / FormatContext->streams[TrackNumber]->time_base.den;
-
-    av_frame_free(&PropFrame);
-
-    if (AP.AF.Bits <= 0) //FIXME, can this still happen?
-        throw BestSourceException("Codec returned zero size audio");
+    AP.Duration = FormatContext->streams[TrackNumber]->duration;
+    AP.TimeBase = FormatContext->streams[TrackNumber]->time_base;
+    AP.NumSamples = (FormatContext->duration * CodecContext->sample_rate) / AV_TIME_BASE - FormatContext->streams[TrackNumber]->codecpar->initial_padding;
 }
 
 AVFrame *LWAudioDecoder::GetNextFrame() {
@@ -382,8 +351,8 @@ BestAudioFrame *BestAudioSource::Cache::GetFrame(int64_t N) {
     return nullptr;
 }
 
-BestAudioSource::BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, bool VariableFormat, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress)
-    : Source(SourceFile), AudioTrack(Track), VariableFormat(VariableFormat), DrcScale(DrcScale), Threads(Threads) {
+BestAudioSource::BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress)
+    : Source(SourceFile), AudioTrack(Track), DrcScale(DrcScale), Threads(Threads) {
     // Only make file path absolute if it exists to pass through special protocol paths
     std::error_code ec;
     if (std::filesystem::exists(SourceFile, ec))
@@ -395,7 +364,7 @@ BestAudioSource::BestAudioSource(const std::filesystem::path &SourceFile, int Tr
     if (CacheMode < 0 || CacheMode > 4)
         throw BestSourceException("CacheMode must be between 0 and 4");
 
-    std::unique_ptr<LWAudioDecoder> Decoder(new LWAudioDecoder(Source, AudioTrack, VariableFormat, Threads, LAVFOptions, DrcScale));
+    std::unique_ptr<LWAudioDecoder> Decoder(new LWAudioDecoder(Source, AudioTrack, Threads, LAVFOptions, DrcScale));
 
     Decoder->GetAudioProperties(AP);
     AudioTrack = Decoder->GetTrack();
@@ -411,9 +380,11 @@ BestAudioSource::BestAudioSource(const std::filesystem::path &SourceFile, int Tr
         }
     }
 
-    AP.NumFrames = TrackIndex.Frames.size();
-    AP.NumSamples = TrackIndex.Frames.back().Start + TrackIndex.Frames.back().Length;
+    InitializeFormatSets();
+    SelectFormatSet(-1);
 
+
+    // FIXME, rework delay adjustment
     if (AjustDelay >= -1)
         SampleDelay = static_cast<int64_t>(GetRelativeStartTime(AjustDelay) * AP.SampleRate);
 
@@ -435,7 +406,7 @@ void BestAudioSource::SetSeekPreRoll(int64_t Frames) {
 }
 
 bool BestAudioSource::IndexTrack(const ProgressFunction &Progress) {
-    std::unique_ptr<LWAudioDecoder> Decoder(new LWAudioDecoder(Source, AudioTrack, VariableFormat, Threads, LAVFOptions, DrcScale));
+    std::unique_ptr<LWAudioDecoder> Decoder(new LWAudioDecoder(Source, AudioTrack, Threads, LAVFOptions, DrcScale));
 
     int64_t FileSize = Progress ? Decoder->GetSourceSize() : -1;
 
@@ -444,9 +415,19 @@ bool BestAudioSource::IndexTrack(const ProgressFunction &Progress) {
     while (true) {
         AVFrame *F = Decoder->GetNextFrame();
         if (!F)
-            break;
+            break;  
+
+        if (F->ch_layout.order == AV_CHANNEL_ORDER_NATIVE) {
+            TrackIndex.Frames.push_back({ F->pts, NumSamples, F->nb_samples, F->format, F->sample_rate, F->ch_layout.nb_channels, F->ch_layout.u.mask, GetHash(F) });
+        } else if (F->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) {
+            AVChannelLayout ch = {};
+            av_channel_layout_default(&ch, F->ch_layout.nb_channels);
+            TrackIndex.Frames.push_back({ F->pts, NumSamples, F->nb_samples, F->format, F->sample_rate, F->ch_layout.nb_channels, ch.u.mask, GetHash(F) });
+        } else {
+            av_frame_free(&F);
+            throw BestSourceException("Ambisonics and custom channel orders not supported");
+        }
 
-        TrackIndex.Frames.push_back({ F->pts, NumSamples, F->nb_samples, GetHash(F) });
         NumSamples += F->nb_samples;
 
         av_frame_free(&F);
@@ -462,6 +443,46 @@ bool BestAudioSource::IndexTrack(const ProgressFunction &Progress) {
     return !TrackIndex.Frames.empty();
 }
 
+void BestAudioSource::InitializeFormatSets() {
+    // Key: format/sample rate/channels/layout, value: frame count, sample count and PTS of the first frame seen
+    std::map<std::tuple<int, int, int, uint64_t>, std::tuple<int64_t, int64_t, int64_t>> SeenSets;
+    for (const auto &Iter : TrackIndex.Frames) {
+        auto V = std::make_tuple(Iter.Format, Iter.SampleRate, Iter.Channels, Iter.ChannelLayout);
+        auto Ins = SeenSets.insert(std::make_pair(V, std::make_tuple(0, 0, Iter.PTS)));
+        if (Ins.second) // first frame of a new combination, sets are kept in order of first appearance
+            FormatSets.push_back(FormatSet{ {}, Iter.Format, Iter.SampleRate, Iter.Channels, Iter.ChannelLayout });
+        std::get<0>(Ins.first->second)++;
+        std::get<1>(Ins.first->second) += Iter.Length;
+    }
+    if (FormatSets.empty()) // FormatSets[0] below would be undefined behavior for an empty index
+        throw BestSourceException("Can't determine audio format sets without any indexed frames");
+    for (auto &Iter : FormatSets) {
+        const auto &Stats = SeenSets.at(std::make_tuple(Iter.Format, Iter.SampleRate, Iter.Channels, Iter.ChannelLayout));
+        Iter.NumFrames = std::get<0>(Stats);
+        Iter.NumSamples = std::get<1>(Stats);
+        if (std::get<2>(Stats) != AV_NOPTS_VALUE)
+            Iter.StartTime = (static_cast<double>(AP.TimeBase.Num) * std::get<2>(Stats)) / AP.TimeBase.Den;
+        Iter.AF.Set(Iter.Format, 0); // FIXME, needs exact bits
+    }
+
+    DefaultFormatSet = FormatSets[0];
+    DefaultFormatSet.NumFrames = TrackIndex.Frames.size();
+    DefaultFormatSet.NumSamples = 0;
+    for (const auto &Iter : FormatSets) {
+        DefaultFormatSet.NumSamples += Iter.NumSamples;
+        if (DefaultFormatSet.Format != Iter.Format)
+            DefaultFormatSet.Format = AV_SAMPLE_FMT_NONE;
+        if (DefaultFormatSet.SampleRate != Iter.SampleRate)
+            DefaultFormatSet.SampleRate = 0;
+        if (DefaultFormatSet.Channels != Iter.Channels || DefaultFormatSet.ChannelLayout != Iter.ChannelLayout) {
+            DefaultFormatSet.Channels = 0;
+            DefaultFormatSet.ChannelLayout = 0;
+        }
+    }
+    if (DefaultFormatSet.Format != AV_SAMPLE_FMT_NONE)
+        DefaultFormatSet.AF.Set(DefaultFormatSet.Format, 0); // FIXME, needs exact bits
+}
+
 double BestAudioSource::GetRelativeStartTime(int Track) const {
     if (Track < 0) {
         try {
@@ -480,7 +501,7 @@ double BestAudioSource::GetRelativeStartTime(int Track) const {
             return AP.StartTime - VP.StartTime;
         } catch (BestSourceException &) {
             try {
-                std::unique_ptr<LWAudioDecoder> Dec(new LWAudioDecoder(Source, false, Track, Threads, LAVFOptions, 0));
+                std::unique_ptr<LWAudioDecoder> Dec(new LWAudioDecoder(Source, Track, Threads, LAVFOptions, 0));
                 BSAudioProperties AP2;
                 Dec->GetAudioProperties(AP2);
                 return AP.StartTime - AP2.StartTime;
@@ -495,6 +516,29 @@ const BSAudioProperties &BestAudioSource::GetAudioProperties() const {
     return AP;
 }
 
+const std::vector<BestAudioSource::FormatSet> &BestAudioSource::GetFormatSets() const { // One entry per distinct format/sample rate/channels/layout combination, see InitializeFormatSets()
+    return FormatSets;
+}
+
+void BestAudioSource::SelectFormatSet(int Index) { // Index -1 selects DefaultFormatSet, any other valid value selects FormatSets[Index]
+    if (Index >= static_cast<int>(FormatSets.size()) || Index < -1)
+        throw BestSourceException("Invalid format set");
+    VariableFormat = Index; // GetFrame() uses this to map frame numbers onto the frames belonging to the selected set
+    BestAudioSource::FormatSet &SrcSet = (Index < 0) ? DefaultFormatSet : FormatSets[Index];
+
+    AP.AF = SrcSet.AF; // The publicly reported properties always mirror the selected set
+    AP.Format = SrcSet.Format;
+    AP.SampleRate = SrcSet.SampleRate;
+    AP.Channels = SrcSet.Channels;
+    AP.ChannelLayout = SrcSet.ChannelLayout;
+
+    AP.StartTime = SrcSet.StartTime;
+
+    AP.NumFrames = SrcSet.NumFrames;
+    AP.NumSamples = SrcSet.NumSamples;
+}
+
+
 // Short algorithm summary
 // 1. If a current decoder is close to the requested frame simply start from there
 //    Determine if a decoder is "close" based on whether or not it is already in the optimal zone based on the existing keyframes
@@ -509,6 +553,21 @@ BestAudioFrame *BestAudioSource::GetFrame(int64_t N, bool Linear) {
     if (N < 0 || N >= AP.NumFrames)
         return nullptr;
 
+    // Adjust frame number if an output format is chosen
+    if (VariableFormat >= 0 && FormatSets.size() > 1) {
+        const auto &ActiveSet = FormatSets[VariableFormat];
+        int64_t UsableFrames = 0;
+        int64_t SourceN = N;
+        for (const auto &Iter : TrackIndex.Frames) {
+            if (Iter.Format != ActiveSet.Format || Iter.SampleRate != ActiveSet.SampleRate || Iter.Channels != ActiveSet.Channels || Iter.ChannelLayout != ActiveSet.ChannelLayout) {
+                N++;
+            } else {
+                if (UsableFrames++ == SourceN)
+                    break;
+            }
+        }
+    }
+
     std::unique_ptr<BestAudioFrame> F(FrameCache.GetFrame(N));
     if (!F)
         F.reset(Linear ? GetFrameLinearInternal(N) : GetFrameInternal(N));
@@ -739,7 +798,7 @@ BestAudioFrame *BestAudioSource::GetFrameInternal(int64_t N) {
 
     int Index = (EmptySlot >= 0) ? EmptySlot : LeastRecentlyUsed;
     if (!Decoders[Index])
-        Decoders[Index].reset(new LWAudioDecoder(Source, AudioTrack, VariableFormat, Threads, LAVFOptions, DrcScale));
+        Decoders[Index].reset(new LWAudioDecoder(Source, AudioTrack, Threads, LAVFOptions, DrcScale));
 
     DecoderLastUse[Index] = DecoderSequenceNum++;
 
@@ -764,7 +823,7 @@ BestAudioFrame *BestAudioSource::GetFrameLinearInternal(int64_t N, int64_t SeekF
     // If an empty slot exists simply spawn a new decoder there or reuse the least recently used decoder slot if no free ones exist
     if (Index < 0) {
         Index = (EmptySlot >= 0) ? EmptySlot : LeastRecentlyUsed;
-        Decoders[Index].reset(new LWAudioDecoder(Source, AudioTrack, VariableFormat, Threads, LAVFOptions, DrcScale));
+        Decoders[Index].reset(new LWAudioDecoder(Source, AudioTrack, Threads, LAVFOptions, DrcScale));
     }
 
     std::unique_ptr<LWAudioDecoder> &Decoder = Decoders[Index];
@@ -979,7 +1038,8 @@ bool BestAudioSource::FillInFramePlanar(const BestAudioFrame *Frame, int64_t Fra
 }
 
 void BestAudioSource::GetPackedAudio(uint8_t *Data, int64_t Start, int64_t Count) {
-    if (VariableFormat)
+    // FIXME, relax the restriction to only requiring the same format within the range if anyone complains
+    if (AP.Format == AV_SAMPLE_FMT_NONE || AP.Channels == 0 || AP.ChannelLayout == 0 || AP.SampleRate == 0) // Format 0 is the valid AV_SAMPLE_FMT_U8, mixed format tracks are marked with AV_SAMPLE_FMT_NONE
         throw BestSourceException("GetPackedAudio() can only be used when variable format is disabled");
 
     Start -= SampleDelay;
@@ -1009,7 +1069,7 @@ void BestAudioSource::GetPackedAudio(uint8_t *Data, int64_t Start, int64_t Count
 }
 
 void BestAudioSource::GetPlanarAudio(uint8_t *const *const Data, int64_t Start, int64_t Count) {
-    if (VariableFormat)
+    if (AP.Format == AV_SAMPLE_FMT_NONE || AP.Channels == 0 || AP.ChannelLayout == 0 || AP.SampleRate == 0) // Format 0 is the valid AV_SAMPLE_FMT_U8, mixed format tracks are marked with AV_SAMPLE_FMT_NONE
         throw BestSourceException("GetPlanarAudio() can only be used when variable format is disabled");
 
     Start -= SampleDelay;
@@ -1046,13 +1106,19 @@ void BestAudioSource::GetPlanarAudio(uint8_t *const *const Data, int64_t Start,
 ////////////////////////////////////////
 // Index read/write
 
-typedef std::array<uint8_t, 16> AudioCompArray;
+typedef std::array<uint8_t, 36> AudioCompArray;
 
-static AudioCompArray GetAudioCompArray(int64_t PTS, int64_t Length) {
+static AudioCompArray GetAudioCompArray(int64_t PTS, int64_t Length, int Format, int SampleRate, int Channels, uint64_t ChannelLayout) {
     AudioCompArray Result;
     memcpy(Result.data(), &PTS, sizeof(PTS));
     memcpy(Result.data() + sizeof(PTS), &Length, sizeof(Length));
-    return Result;
+    memcpy(Result.data() + sizeof(PTS) + sizeof(Length), &Format, sizeof(Format));
+    memcpy(Result.data() + sizeof(PTS) + sizeof(Length) + sizeof(Format), &SampleRate, sizeof(SampleRate));
+    memcpy(Result.data() + sizeof(PTS) + sizeof(Length) + sizeof(Format) + sizeof(SampleRate), &Channels, sizeof(Channels));
+    memcpy(Result.data() + sizeof(PTS) + sizeof(Length) + sizeof(Format) + sizeof(SampleRate) + sizeof(Channels), &ChannelLayout, sizeof(ChannelLayout));
+    // The result is used as a dictionary key, so every byte must be written and nothing may be written past the end
+    static_assert(sizeof(PTS) + sizeof(Length) + sizeof(Format) + sizeof(SampleRate) + sizeof(Channels) + sizeof(ChannelLayout) == std::tuple_size<AudioCompArray>::value, "AudioCompArray size must match the packed fields");
+    return Result;
 }
 
 bool BestAudioSource::WriteAudioTrackIndex(bool AbsolutePath, const std::filesystem::path &CachePath) {
@@ -1062,7 +1128,6 @@ bool BestAudioSource::WriteAudioTrackIndex(bool AbsolutePath, const std::filesys
     WriteBSHeader(F, false);
     WriteInt64(F, FileSize);
     WriteInt(F, AudioTrack);
-    WriteInt(F, VariableFormat);
     WriteDouble(F, DrcScale);
 
     WriteInt(F, static_cast<int>(LAVFOptions.size()));
@@ -1088,7 +1153,7 @@ bool BestAudioSource::WriteAudioTrackIndex(bool AbsolutePath, const std::filesys
             LastPTSValue = OrigPTS;
         }
 
-        Dict.insert(std::make_pair(GetAudioCompArray(PTS, Iter.Length), 0));
+        Dict.insert(std::make_pair(GetAudioCompArray(PTS, Iter.Length, Iter.Format, Iter.SampleRate, Iter.Channels, Iter.ChannelLayout), 0));
     }
 
     // Only bother with a dictionary if it's not too big
@@ -1113,7 +1178,7 @@ bool BestAudioSource::WriteAudioTrackIndex(bool AbsolutePath, const std::filesys
                 LastPTSValue = OrigPTS;
             }
 
-            WriteByte(F, Dict[GetAudioCompArray(PTS, Iter.Length)]);
+            WriteByte(F, Dict[GetAudioCompArray(PTS, Iter.Length, Iter.Format, Iter.SampleRate, Iter.Channels, Iter.ChannelLayout)]);
             fwrite(Iter.Hash.data(), 1, Iter.Hash.size(), F.get());
         }
     } else {
@@ -1123,6 +1188,10 @@ bool BestAudioSource::WriteAudioTrackIndex(bool AbsolutePath, const std::filesys
             fwrite(Iter.Hash.data(), 1, Iter.Hash.size(), F.get());
             WriteInt64(F, Iter.PTS);
             WriteInt64(F, Iter.Length);
+            WriteInt(F, Iter.Format);
+            WriteInt(F, Iter.SampleRate);
+            WriteInt(F, Iter.Channels);
+            WriteInt64(F, Iter.ChannelLayout);
         }
     }
 
@@ -1139,8 +1208,6 @@ bool BestAudioSource::ReadAudioTrackIndex(bool AbsolutePath, const std::filesyst
         return false;
     if (!ReadCompareInt(F, AudioTrack))
         return false;
-    if (!ReadCompareInt(F, VariableFormat))
-        return false;
     if (!ReadCompareDouble(F, DrcScale))
         return false;
 
@@ -1166,6 +1233,10 @@ bool BestAudioSource::ReadAudioTrackIndex(bool AbsolutePath, const std::filesyst
             FrameInfo FI = {};
             FI.PTS = ReadInt64(F);
             FI.Length = ReadInt64(F);
+            FI.Format = ReadInt(F);
+            FI.SampleRate = ReadInt(F);
+            FI.Channels = ReadInt(F);
+            FI.ChannelLayout = ReadInt64(F);
             Dict[i] = FI;
         }
 
@@ -1189,6 +1260,10 @@ bool BestAudioSource::ReadAudioTrackIndex(bool AbsolutePath, const std::filesyst
             FI.PTS = ReadInt64(F);
             FI.Start = AP.NumSamples;
             FI.Length = ReadInt64(F);
+            FI.Format = ReadInt(F);
+            FI.SampleRate = ReadInt(F);
+            FI.Channels = ReadInt(F);
+            FI.ChannelLayout = ReadInt64(F);
             AP.NumSamples += FI.Length;
             TrackIndex.Frames.push_back(FI);
         }
diff --git a/src/audiosource.h b/src/audiosource.h
index 0f5e857..86bd162 100644
--- a/src/audiosource.h
+++ b/src/audiosource.h
@@ -45,13 +45,24 @@ struct BSAudioFormat {
     void Set(int Format, int BitsPerRawSample);
 };
 
-struct BSAudioProperties {
+// Properties LWAudioDecoder reads from the container and codec context without decoding any frames
+
+struct LWAudioProperties {
+    BSRational TimeBase; // time base of the audio stream
+    int64_t Duration; // duration of the audio stream as reported by the demuxer
+
+    int64_t NumSamples; /* estimated by decoder, may be wrong */
+};
+
+struct BSAudioProperties : public LWAudioProperties {
     BSAudioFormat AF;
+    int Format;
     int SampleRate;
     int Channels;
     uint64_t ChannelLayout;
-    int64_t NumFrames; // can be -1 to signal that the number of frames is completely unknown
-    int64_t NumSamples; /* estimated by decoder, may be wrong */
+
+    int64_t NumFrames;
+
     double StartTime; /* in seconds */
 };
 
@@ -67,12 +78,12 @@ class LWAudioDecoder {
     AVPacket *Packet = nullptr;
     bool Seeked = false;
 
-    void OpenFile(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale);
+    void OpenFile(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale);
     bool ReadPacket();
     bool DecodeNextFrame(bool SkipOutput = false);
     void Free();
 public:
-    LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, bool VariableFormat, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale); // Positive track numbers are absolute. Negative track numbers mean nth audio track to simplify things.
+    LWAudioDecoder(const std::filesystem::path &SourceFile, int Track, int Threads, const std::map<std::string, std::string> &LAVFOpts, double DrcScale); // Positive track numbers are absolute. Negative track numbers mean nth audio track to simplify things.
     ~LWAudioDecoder();
     [[nodiscard]] int64_t GetSourceSize() const;
     [[nodiscard]] int64_t GetSourcePostion() const;
@@ -80,7 +91,7 @@ class LWAudioDecoder {
     [[nodiscard]] int64_t GetFrameNumber() const; // The frame you will get when calling GetNextFrame()
     [[nodiscard]] int64_t GetSamplePos() const; // The frame you will get when calling GetNextFrame()
     void SetFrameNumber(int64_t N, int64_t SampleNumber); // Use after seeking to update internal frame number
-    void GetAudioProperties(BSAudioProperties &VP); // Decodes one frame and advances the position to retrieve the full properties, only call directly after creation
+    void GetAudioProperties(LWAudioProperties &VP); // Fills in the time base, duration and estimated sample count of the opened stream, no frame is decoded and the position isn't advanced
     [[nodiscard]] AVFrame *GetNextFrame();
     bool SkipFrames(int64_t Count);
     [[nodiscard]] bool HasMoreFrames() const;
@@ -105,10 +116,27 @@ class BestAudioFrame {
 
 class BestAudioSource {
 public:
+    struct FormatSet {
+        BSAudioFormat AF = {};
+        int Format = -1; // same value as AV_SAMPLE_FMT_NONE, overwritten when the set is built
+        int SampleRate = 0;
+        int Channels = 0;
+        uint64_t ChannelLayout = 0;
+
+        double StartTime = 0;
+
+        int64_t NumFrames = 0; // can be -1 to signal that the number of frames is completely unknown
+        int64_t NumSamples = 0;
+    };
+
     struct FrameInfo {
         int64_t PTS;
         int64_t Start;
         int64_t Length;
+        int Format;
+        int SampleRate;
+        int Channels;
+        uint64_t ChannelLayout;
         std::array<uint8_t, HashSize> Hash;
     };
 private:
@@ -144,13 +172,16 @@ class BestAudioSource {
     AudioTrackIndex TrackIndex;
     Cache FrameCache;
 
+    std::vector<FormatSet> FormatSets;
+    FormatSet DefaultFormatSet;
+
     static constexpr int MaxVideoSources = 4;
     std::map<std::string, std::string> LAVFOptions;
     double DrcScale;
     BSAudioProperties AP = {};
     std::filesystem::path Source;
     int AudioTrack;
-    bool VariableFormat;
+    int VariableFormat = -1;
     int Threads;
     bool LinearMode = false;
     uint64_t DecoderSequenceNum = 0;
@@ -167,6 +198,7 @@ class BestAudioSource {
     [[nodiscard]] BestAudioFrame *GetFrameInternal(int64_t N);
     [[nodiscard]] BestAudioFrame *GetFrameLinearInternal(int64_t N, int64_t SeekFrame = -1, size_t Depth = 0, bool ForceUnseeked = false);
     [[nodiscard]] bool IndexTrack(const ProgressFunction &Progress = nullptr);
+    void InitializeFormatSets();
     void ZeroFillStartPacked(uint8_t *&Data, int64_t &Start, int64_t &Count);
     void ZeroFillEndPacked(uint8_t *Data, int64_t Start, int64_t &Count);
     bool FillInFramePacked(const BestAudioFrame *Frame, int64_t FrameStartSample, uint8_t *&Data, int64_t &Start, int64_t &Count);
@@ -180,12 +212,14 @@ class BestAudioSource {
         int64_t FirstSamplePos;
     };
 
-    BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, bool VariableFormat, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress = nullptr);
+    BestAudioSource(const std::filesystem::path &SourceFile, int Track, int AjustDelay, int Threads, int CacheMode, const std::filesystem::path &CachePath, const std::map<std::string, std::string> *LAVFOpts, double DrcScale, const ProgressFunction &Progress = nullptr);
     [[nodiscard]] int GetTrack() const; // Useful when opening nth video track to get the actual number
     void SetMaxCacheSize(size_t Bytes); /* default max size is 1GB */
     void SetSeekPreRoll(int64_t Frames); /* the number of frames to cache before the position being fast forwarded to */
     double GetRelativeStartTime(int Track) const;
     [[nodiscard]] const BSAudioProperties &GetAudioProperties() const;
+    [[nodiscard]] const std::vector<FormatSet> &GetFormatSets() const; /* Get a listing of all format sets found in the track, in order of first appearance */
+    void SelectFormatSet(int Index); /* Sets the output format to the specified format set, passing -1 means the default variable format will be used, throws BestSourceException on an invalid index */
     [[nodiscard]] BestAudioFrame *GetFrame(int64_t N, bool Linear = false);
     [[nodiscard]] FrameRange GetFrameRangeBySamples(int64_t Start, int64_t Count) const;
     void GetPackedAudio(uint8_t *Data, int64_t Start, int64_t Count);
diff --git a/src/avisynth.cpp b/src/avisynth.cpp
index 2de58a2..1111d31 100644
--- a/src/avisynth.cpp
+++ b/src/avisynth.cpp
@@ -304,7 +304,9 @@ class AvisynthAudioSource : public IClip {
             Opts["use_absolute_path"] = "1";
 
         try {
-            A.reset(new BestAudioSource(CreateProbablyUTF8Path(Source), Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+            A.reset(new BestAudioSource(CreateProbablyUTF8Path(Source), Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+
+            A->SelectFormatSet(0);
 
             const BSAudioProperties &AP = A->GetAudioProperties();
             if (AP.AF.Float && AP.AF.Bits == 32) {
diff --git a/src/vapoursynth.cpp b/src/vapoursynth.cpp
index ce64c84..46c5243 100644
--- a/src/vapoursynth.cpp
+++ b/src/vapoursynth.cpp
@@ -319,7 +319,7 @@ static void VS_CC CreateBestAudioSource(const VSMap *In, VSMap *Out, void *, VSC
         if (ShowProgress) {
             auto NextUpdate = std::chrono::high_resolution_clock::now();
             int LastValue = -1;
-            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale,
+            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale,
                 [vsapi, Core, &NextUpdate, &LastValue](int Track, int64_t Cur, int64_t Total) {
                     if (NextUpdate < std::chrono::high_resolution_clock::now()) {
                         if (Total == INT64_MAX && Cur == Total) {
@@ -337,9 +337,11 @@ static void VS_CC CreateBestAudioSource(const VSMap *In, VSMap *Out, void *, VSC
                 }));
 
         } else {
-            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, false, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
+            D->A.reset(new BestAudioSource(Source, Track, AdjustDelay, Threads, CacheMode, CachePath ? CachePath : "", &Opts, DrcScale));
         }
 
+        D->A->SelectFormatSet(0);
+
         const BSAudioProperties &AP = D->A->GetAudioProperties();
         D->Is8Bit = (AP.AF.Bits <= 8);
         if (!vsapi->queryAudioFormat(&D->AI.format, AP.AF.Float, D->Is8Bit ? 16 : AP.AF.Bits, AP.ChannelLayout, Core))
diff --git a/src/version.h b/src/version.h
index 78b6472..d72a460 100644
--- a/src/version.h
+++ b/src/version.h
@@ -21,7 +21,7 @@
 #ifndef VERSION_H
 #define VERSION_H
 
-#define BEST_SOURCE_VERSION_MAJOR 6
+#define BEST_SOURCE_VERSION_MAJOR 9
 #define BEST_SOURCE_VERSION_MINOR 0
 
 #endif
\ No newline at end of file
diff --git a/src/videosource.cpp b/src/videosource.cpp
index 3ef9724..e2bdcc6 100644
--- a/src/videosource.cpp
+++ b/src/videosource.cpp
@@ -1398,10 +1398,10 @@ bool BestVideoSource::InitializeRFF() {
 }
 
 void BestVideoSource::InitializeFormatSets() {
-    std::map<std::tuple<int, int, int>, std::tuple<int64_t, int64_t, int64_t, const FrameInfo *>> SeenSets;
+    std::map<std::tuple<int, int, int>, std::tuple<int64_t, int64_t, int64_t, bool>> SeenSets;
     for (const auto &Iter : TrackIndex.Frames) {
         auto V = std::make_tuple(Iter.Format, Iter.Width, Iter.Height);
-        if (SeenSets.insert(std::make_pair(V, std::make_tuple(0, 0, 0, &Iter))).second)
+        if (SeenSets.insert(std::make_pair(V, std::make_tuple(0, 0, Iter.PTS, Iter.TFF))).second)
             FormatSets.push_back(FormatSet{ {}, Iter.Format, Iter.Width, Iter.Height });
         std::get<0>(SeenSets[V])++;
         std::get<1>(SeenSets[V]) += Iter.RepeatPict + 2;
@@ -1410,17 +1410,21 @@ void BestVideoSource::InitializeFormatSets() {
     for (auto &Iter : FormatSets) {
         auto V = std::make_tuple(Iter.Format, Iter.Width, Iter.Height);
         Iter.NumFrames = std::get<0>(SeenSets[V]);
-        Iter.NumRFFFrames = (std::get<1>(SeenSets[V]) + 1) / 2;
-        Iter.TFF = std::get<3>(SeenSets[V])->TFF;
-        if (std::get<3>(SeenSets[V])->PTS != AV_NOPTS_VALUE)
-            Iter.StartTime = (static_cast<double>(VP.TimeBase.Num) * std::get<3>(SeenSets[V])->PTS) / VP.TimeBase.Den;      
+        Iter.NumRFFFrames = std::get<1>(SeenSets[V]);
+        Iter.TFF = std::get<3>(SeenSets[V]);
+        if (std::get<2>(SeenSets[V]) != AV_NOPTS_VALUE)
+            Iter.StartTime = (static_cast<double>(VP.TimeBase.Num) * std::get<2>(SeenSets[V])) / VP.TimeBase.Den;      
         Iter.VF.Set(av_pix_fmt_desc_get(static_cast<AVPixelFormat>(Iter.Format)));
     }
 
     DefaultFormatSet = FormatSets[0];
     DefaultFormatSet.NumFrames = TrackIndex.Frames.size();
     DefaultFormatSet.NumRFFFrames = 0;
-    for (const auto &Iter : FormatSets) {
+
+    for (auto &Iter : FormatSets) {
+        DefaultFormatSet.NumRFFFrames += Iter.NumRFFFrames;
+        Iter.NumRFFFrames = (Iter.NumRFFFrames + 1) / 2; // Can't round before adding it together
+
         if (DefaultFormatSet.Format != Iter.Format)
             DefaultFormatSet.Format = AV_PIX_FMT_NONE;
         if (DefaultFormatSet.Width != Iter.Width || DefaultFormatSet.Height != Iter.Height) {
@@ -1428,6 +1432,9 @@ void BestVideoSource::InitializeFormatSets() {
             DefaultFormatSet.Height = 0;
         }
     }
+
+    DefaultFormatSet.NumRFFFrames = (DefaultFormatSet.NumRFFFrames + 1) / 2;
+
     if (DefaultFormatSet.Format != AV_PIX_FMT_NONE)
         DefaultFormatSet.VF.Set(av_pix_fmt_desc_get(static_cast<AVPixelFormat>(DefaultFormatSet.Format)));
     else
diff --git a/src/videosource.h b/src/videosource.h
index 183597d..441ae9e 100644
--- a/src/videosource.h
+++ b/src/videosource.h
@@ -84,7 +84,7 @@ struct LWVideoProperties {
 
 struct BSVideoProperties : public LWVideoProperties {
     BSVideoFormat VF;
-    int Format; // fixme, needed?
+    int Format;
 
     int Width;
     int Height;