From 66fcf72f76f18b164e65b282ffc1738274c4facf Mon Sep 17 00:00:00 2001 From: gitamohr Date: Thu, 7 Dec 2023 09:39:33 -0800 Subject: [PATCH] tf: A token change. Make the 'immortal' bit be part of the atomic reference count. This is required for thread-safety. Instead of eagerly reaping tokens whose reference counts go to zero, sweep for garbage when an insert would trigger a rehash. (Internal change: 2307757) --- pxr/base/tf/token.cpp | 201 ++++++++++++++++++++++++------------------ pxr/base/tf/token.h | 158 ++++++++++++++++++--------------- 2 files changed, 203 insertions(+), 156 deletions(-) diff --git a/pxr/base/tf/token.cpp b/pxr/base/tf/token.cpp index e61c882efe..c1b87a9430 100644 --- a/pxr/base/tf/token.cpp +++ b/pxr/base/tf/token.cpp @@ -39,12 +39,14 @@ #include -#include -#include -#include +#include +#include #include +#include +#include +#include #include -#include +#include using std::vector; using std::string; @@ -85,29 +87,23 @@ struct Tf_TokenRegistry typedef _Hash<7> _OuterHash; typedef _Hash<5> _InnerHash; - static const size_t _NumSets = 128; - static const size_t _SetMask = _NumSets-1; - - // Utility to pad an instance to take up a whole number of cache lines to - // avoid false sharing. - template - struct _CacheLinePadded { - T val; - char _unused_padding[ - ARCH_CACHE_LINE_SIZE-(sizeof(T) % ARCH_CACHE_LINE_SIZE)]; - }; + static constexpr unsigned _NumSets = 128; + static constexpr unsigned _SetMask = _NumSets-1; + static constexpr size_t _MinInsertsUntilSweepCheck = 32; + + // TODO: Consider a better container than TfHashSet here. + using _RepSet = TfHashSet; - // It's not ideal to use TfHashSet here. It would be better to use - // boost::multi_index from boost 1.56 or newer: it lets us lookup an element - // by a key more easily. It also has better performance. Earlier versions - // suffer from the issue where erase(iterator) has to return an iterator to - // the next element, which has to scan forward to find the next, so we use - // TfHashSet for now. - typedef TfHashSet _RepSet; + // Align this structure to ensure no false sharing of 'mutex'. + struct alignas(ARCH_CACHE_LINE_SIZE) + _Set { + _RepSet reps; + unsigned insertsUntilSweepCheck = _MinInsertsUntilSweepCheck; + mutable tbb::spin_mutex mutex; + }; // Data members. - _RepSet _sets[_NumSets]; - mutable _CacheLinePadded _locks[_NumSets]; + _Set _sets[_NumSets]; static Tf_TokenRegistry& _GetInstance() { return TfSingleton::GetInstance(); @@ -136,48 +132,26 @@ struct Tf_TokenRegistry return _FindPtrImpl(s); } - /* - * rep may be dying. remove its key and raw pointer reference - * from the table if it truly is. - */ - void _PossiblyDestroyRep(_RepPtr rep) { - bool repFoundInSet = true; - string repString; - { - unsigned int setNum = rep->_setNum; - - tbb::spin_mutex::scoped_lock lock(_locks[setNum].val); - - if (!rep->_isCounted) - return; - - /* - * We hold the lock, but there could be others outside the lock - * modifying this same counter. Be safe: be atomic. 
- */ - if (--rep->_refCount != 0) - return; - - if (!_sets[setNum].erase(*rep)) { - repFoundInSet = false; - repString = rep->_str; - } - } - TF_VERIFY(repFoundInSet, - "failed to find token '%s' in table for destruction", - repString.c_str()); - } - void _DumpStats() const { std::vector > sizesWithSet; for (size_t i = 0; i != _NumSets; ++i) { - sizesWithSet.push_back(std::make_pair(_sets[i].size(), i)); + tbb::spin_mutex::scoped_lock lock(_sets[i].mutex); + sizesWithSet.push_back(std::make_pair(_sets[i].reps.size(), i)); } std::sort(sizesWithSet.begin(), sizesWithSet.end()); printf("Set # -- Size\n"); TF_FOR_ALL(i, sizesWithSet) { printf("%zu -- %zu\n", i->second, i->first); } + + // Uncomment to dump every token & refcount. + // for (size_t i = 0; i != _NumSets; ++i) { + // tbb::spin_mutex::scoped_lock lock(_sets[i].mutex); + // for (auto const &rep: _sets[i].reps) { + // printf("%d '%s'\n", rep._refCount.load(), rep._str.c_str()); + // } + // } + } private: @@ -212,53 +186,114 @@ struct Tf_TokenRegistry */ template inline _RepPtrAndBits _GetPtrImpl(Str s, bool makeImmortal) { - if (_IsEmpty(s)) + if (_IsEmpty(s)) { return _RepPtrAndBits(); + } - size_t setNum = _GetSetNum(_CStr(s)); + unsigned setNum = _GetSetNum(_CStr(s)); + _Set &set = _sets[setNum]; - tbb::spin_mutex::scoped_lock lock(_locks[setNum].val); + tbb::spin_mutex::scoped_lock lock(set.mutex); // Insert or lookup an existing. - _RepSet::iterator iter = _sets[setNum].find(_LookupRep(_CStr(s))); - if (iter != _sets[setNum].end()) { + _RepSet::iterator iter = set.reps.find(_LookupRep(_CStr(s))); + if (iter != set.reps.end()) { _RepPtr rep = &(*iter); - bool isCounted = rep->_isCounted; + bool isCounted = rep->_refCount.load(std::memory_order_relaxed) & 1; if (isCounted) { - if (makeImmortal) - isCounted = rep->_isCounted = false; - else - ++rep->_refCount; + if (makeImmortal) { + // Clear counted flag. + rep->_refCount.fetch_and(~1u, std::memory_order_relaxed); + isCounted = false; + } + else { + // Add one reference (we count by 2's). + rep->_refCount.fetch_add(2, std::memory_order_relaxed); + } } return _RepPtrAndBits(rep, isCounted); } else { - // No entry present, add a new entry. + // No entry present, add a new entry. First check if we should + // clean house. + if (set.insertsUntilSweepCheck == 0) { + // If this insert would cause a rehash, clean house first. + // Remove any reps that have refcount zero. This is safe to do + // since we hold a lock on the set's mutex: no other thread can + // observe or manipulate this set concurrently. + if ((float(set.reps.size() + 1) / + float(set.reps.bucket_count()) > + set.reps.max_load_factor())) { + // Walk the set and erase any non-immortal reps with no + // outstanding refcounts (_refCount == 1). Recall that the + // low-order bit indicates counted (non-immortal) reps, we + // count references by 2. + for (auto iter = set.reps.begin(), + end = set.reps.end(); iter != end; ) { + if (iter->_refCount == 1) { + set.reps.erase(iter++); + } + else { + ++iter; + } + } + // Reset insertsUntilSweepCheck. We want to wait at least a + // bit to avoid bad patterns. For example if we sweep the + // table and find one garbage token, we will remove it and + // then insert the new token. On the next insert we'll + // sweep for garbage again. If the calling code is + // consuming a list of tokens, discarding one and creating a + // new one on each iteration, we could end up sweeping the + // whole table like this on every insertion. 
By waiting a + // bit we avoid these situations at the cost of the + // occasional "unnecessary" rehash. + set.insertsUntilSweepCheck = std::max( + _MinInsertsUntilSweepCheck, + static_cast( + set.reps.bucket_count() * + float(set.reps.max_load_factor() - + set.reps.load_factor()))); + } + } + else { + --set.insertsUntilSweepCheck; + } + + // Create the new entry. TfAutoMallocTag noname("TfToken"); - _RepPtr rep = &(*_sets[setNum].insert(TfToken::_Rep(s)).first); - rep->_isCounted = !makeImmortal; - rep->_setNum = setNum; - rep->_compareCode = _ComputeCompareCode(rep->_cstr); - if (!makeImmortal) - rep->_refCount = 1; + uint64_t compareCode = _ComputeCompareCode(_CStr(s)); + _RepPtr rep = &(*set.reps.insert( + TfToken::_Rep(s, setNum, compareCode)).first); + // 3, because 1 for counted bit plus 2 for one refcount. We can + // store relaxed here since our lock on the set's mutex provides + // synchronization. + rep->_refCount.store(makeImmortal ? 0 : 3, + std::memory_order_relaxed); return _RepPtrAndBits(rep, !makeImmortal); } } template _RepPtrAndBits _FindPtrImpl(Str s) const { - if (_IsEmpty(s)) + if (_IsEmpty(s)) { return _RepPtrAndBits(); + } size_t setNum = _GetSetNum(_CStr(s)); + _Set const &set = _sets[setNum]; - tbb::spin_mutex::scoped_lock lock(_locks[setNum].val); + tbb::spin_mutex::scoped_lock lock(set.mutex); - _RepSet::const_iterator iter = - _sets[setNum].find(_LookupRep(_CStr(s))); - if (iter == _sets[setNum].end()) + _RepSet::const_iterator iter = set.reps.find(_LookupRep(_CStr(s))); + if (iter == set.reps.end()) { return _RepPtrAndBits(); + } + + TfToken::_Rep const *rep = &(*iter); + + unsigned oldVal = + rep->_refCount.fetch_add(2, std::memory_order_relaxed); - return _RepPtrAndBits(&(*iter), iter->IncrementIfCounted()); + return _RepPtrAndBits(rep, oldVal & 1); } }; @@ -272,12 +307,6 @@ TF_REGISTRY_FUNCTION(TfType) .Alias( TfType::GetRoot(), "vector" ); } -void -TfToken::_PossiblyDestroyRep() const -{ - Tf_TokenRegistry::_GetInstance()._PossiblyDestroyRep(_rep.Get()); -} - string const& TfToken::_GetEmptyString() { diff --git a/pxr/base/tf/token.h b/pxr/base/tf/token.h index 31fdef19c3..f61dcfc86b 100644 --- a/pxr/base/tf/token.h +++ b/pxr/base/tf/token.h @@ -90,7 +90,7 @@ class TfToken enum _ImmortalTag { Immortal }; /// Create the empty token, containing the empty string. - constexpr TfToken() noexcept {} + constexpr TfToken() noexcept = default; /// Copy constructor. TfToken(TfToken const& rhs) noexcept : _rep(rhs._rep) { _AddRef(); } @@ -269,13 +269,16 @@ class TfToken /// Allows \c TfToken to be used in \c std::set inline bool operator<(TfToken const& r) const { auto ll = _rep.GetLiteral(), rl = r._rep.GetLiteral(); - if (ll && rl) { - auto lrep = _rep.Get(), rrep = r._rep.Get(); - uint64_t lcc = lrep->_compareCode, rcc = rrep->_compareCode; - return lcc < rcc || (lcc == rcc && lrep->_str < rrep->_str); + if (!ll || !rl) { + // One or both are zero -- return true if ll is zero and rl is not. + return !ll && rl; } - // One or both are zero -- return true if ll is zero and rl is not. - return !ll && rl; + if (ll == rl) { + return false; + } + auto lrep = _rep.Get(), rrep = r._rep.Get(); + uint64_t lcc = lrep->_compareCode, rcc = rrep->_compareCode; + return lcc < rcc || (lcc == rcc && lrep->_str < rrep->_str); } /// Greater-than operator that compares tokenized strings lexicographically. 
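
The new insertion path in _GetPtrImpl (token.cpp hunk above) no longer reaps a token the moment its count reaches zero; dead reps simply sit in the table until an insert is about to force a rehash, and only then are they swept out, with a minimum number of inserts between sweep checks so that a drop-one, insert-one workload cannot trigger a full sweep on every insertion. Below is a minimal single-threaded sketch of that pattern. The container, the names, and the plain refCount == 0 test are illustrative assumptions rather than the USD code, which guards each set with a tbb::spin_mutex and stores TfToken::_Rep values.

// Minimal, single-threaded sketch of "sweep dead entries only when an
// insert is about to force a rehash".  Names and types are illustrative
// assumptions, not the USD implementation.
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <string>
#include <unordered_set>

struct Entry {
    std::string key;
    mutable unsigned refCount = 0;   // 0 means "garbage" in this sketch
    bool operator==(Entry const &o) const { return key == o.key; }
};

struct EntryHash {
    std::size_t operator()(Entry const &e) const {
        return std::hash<std::string>()(e.key);
    }
};

class SweptSet {
public:
    // Look up or create an entry, adding one reference to it.
    Entry const *Acquire(std::string const &key) {
        auto it = _reps.find(Entry{key});
        if (it != _reps.end()) {
            ++it->refCount;
            return &*it;
        }
        if (_insertsUntilSweepCheck == 0) {
            // Only bother sweeping if this insert would push the table
            // past its load factor and trigger a rehash anyway.
            if (float(_reps.size() + 1) / float(_reps.bucket_count()) >
                _reps.max_load_factor()) {
                for (auto i = _reps.begin(); i != _reps.end();) {
                    i = (i->refCount == 0) ? _reps.erase(i) : std::next(i);
                }
                // Back off before the next sweep check so a workload that
                // drops one entry and inserts one entry per step cannot
                // make us rescan the whole table on every insert.
                float slack = _reps.max_load_factor() - _reps.load_factor();
                _insertsUntilSweepCheck = std::max<std::size_t>(
                    _MinInsertsUntilSweepCheck,
                    slack > 0 ? std::size_t(_reps.bucket_count() * slack) : 0);
            }
        } else {
            --_insertsUntilSweepCheck;
        }
        return &*_reps.insert(Entry{key, 1u}).first;
    }

    // Drop a reference; the entry stays in the table until a future sweep.
    void Release(Entry const *e) { --e->refCount; }

private:
    static constexpr std::size_t _MinInsertsUntilSweepCheck = 32;
    std::unordered_set<Entry, EntryHash> _reps;
    std::size_t _insertsUntilSweepCheck = _MinInsertsUntilSweepCheck;
};

In the patch itself the same walk is safe under concurrency because the per-set spin mutex is held for the whole insert, so no other thread can resurrect a zero-count rep mid-sweep; this sketch omits the locking to keep the rehash test and the back-off computation visible.
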
@@ -301,8 +304,23 @@ class TfToken /// Returns \c true iff this token contains the empty string \c "" bool IsEmpty() const { return _rep.GetLiteral() == 0; } - /// Returns \c true iff this is an immortal token. - bool IsImmortal() const { return !_rep->_isCounted; } + /// Returns \c true iff this is an immortal token. Note that a return of \c + /// false could be instantly stale if another thread races to immortalize + /// this token. A return of \c true is always valid since tokens cannot + /// lose immortality. + bool IsImmortal() const { + if (!_rep.BitsAs()) { + return true; + } + // There is no synchronization or ordering constraints between this read + // and other reads/writes, so relaxed memory order suffices. + bool immortal = !(_rep->_refCount.load(std::memory_order_relaxed) & 1); + if (immortal) { + // Our belief is wrong, update our cache of countedness. + _rep.SetBits(false); + } + return immortal; + } /// Stream insertion. friend TF_API std::ostream &operator <<(std::ostream &stream, TfToken const&); @@ -321,88 +339,88 @@ class TfToken } void _AddRef() const { - if (_rep.BitsAs()) { - // We believe this rep is refCounted. - if (!_rep->IncrementIfCounted()) { - // Our belief is wrong, update our cache of countedness. - _rep.SetBits(false); - } + if (!_rep.BitsAs()) { + // Not counted, do nothing. + return; + } + // We believe this rep is refCounted. + if (!_rep->IncrementAndCheckCounted()) { + // Our belief is wrong, update our cache of countedness. + _rep.SetBits(false); } } void _RemoveRef() const { - if (_rep.BitsAs()) { - // We believe this rep is refCounted. - if (_rep->_isCounted) { - if (_rep->_refCount.load(std::memory_order_relaxed) == 1) { - _PossiblyDestroyRep(); - } - else { - /* - * This is deliberately racy. It's possible the statement - * below drops our count to zero, and we leak the rep - * (i.e. we leave it in the table). That's a low - * probability event, in exchange for only grabbing the lock - * (in _PossiblyDestroyRep()) when the odds are we really do - * need to modify the table. - * - * Note that even if we leak the rep, if we look it up - * again, we'll simply repull it from the table and keep - * using it. So it's not even necessarily a true leak -- - * it's just a potential leak. - */ - _rep->_refCount.fetch_sub(1, std::memory_order_relaxed); - } - } else { - // Our belief is wrong, update our cache of countedness. - _rep.SetBits(false); - } + if (!_rep.BitsAs()) { + // Not counted, do nothing. + return; + } + // We believe this rep is refCounted. + if (!_rep->DecrementAndCheckCounted()) { + // Our belief is wrong, update our cache of countedness. + _rep.SetBits(false); } } - - void TF_API _PossiblyDestroyRep() const; - + struct _Rep { - _Rep() {} - explicit _Rep(char const *s) : _str(s), _cstr(_str.c_str()) {} - explicit _Rep(std::string const &s) : _str(s), _cstr(_str.c_str()) {} + _Rep() = default; + + explicit _Rep(std::string &&str, + unsigned setNum, + uint64_t compareCode) + : _setNum(setNum) + , _compareCode(compareCode) + , _str(std::move(str)) + , _cstr(_str.c_str()) {} + + explicit _Rep(std::string const &str, + unsigned setNum, + uint64_t compareCode) + : _Rep(std::string(str), setNum, compareCode) {} + + explicit _Rep(char const *str, + unsigned setNum, + uint64_t compareCode) + : _Rep(std::string(str), setNum, compareCode) {} // Make sure we reacquire _cstr from _str on copy and assignment // to avoid holding on to a dangling pointer. However, if rhs' // _cstr member doesn't come from its _str, just copy it directly // over. 
This is to support lightweight _Rep objects used for // internal lookups. - _Rep(_Rep const &rhs) : _str(rhs._str), - _cstr(rhs._str.c_str() != rhs._cstr ? - rhs._cstr : _str.c_str()), - _compareCode(rhs._compareCode), - _refCount(rhs._refCount.load()), - _isCounted(rhs._isCounted), - _setNum(rhs._setNum) {} - _Rep& operator=(_Rep const &rhs) { - _str = rhs._str; - _cstr = (rhs._str.c_str() != rhs._cstr ? rhs._cstr : _str.c_str()); - _compareCode = rhs._compareCode; - _refCount = rhs._refCount.load(); - _isCounted = rhs._isCounted; + _Rep(_Rep const &rhs) + : _refCount(rhs._refCount.load(std::memory_order_relaxed)) + , _setNum(rhs._setNum) + , _compareCode(rhs._compareCode) + , _str(rhs._str) + , _cstr(rhs._str.c_str() == rhs._cstr ? _str.c_str() : rhs._cstr) {} + + _Rep &operator=(_Rep const &rhs) { + _refCount = rhs._refCount.load(std::memory_order_relaxed); _setNum = rhs._setNum; + _compareCode = rhs._compareCode; + _str = rhs._str; + _cstr = (rhs._str.c_str() == rhs._cstr ? _str.c_str() : rhs._cstr); return *this; } - inline bool IncrementIfCounted() const { - const bool isCounted = _isCounted; - if (isCounted) { - _refCount.fetch_add(1, std::memory_order_relaxed); - } - return isCounted; + inline bool IncrementAndCheckCounted() const { + // Refcounts are manipulated by add/sub 2, since the lowest-order + // bit indicates whether or not the rep is counted. + return _refCount.fetch_add(2, std::memory_order_relaxed) & 1; + } + + inline bool DecrementAndCheckCounted() const { + // Refcounts are manipulated by add/sub 2, since the lowest-order + // bit indicates whether or not the rep is counted. + return _refCount.fetch_sub(2, std::memory_order_release) & 1; } + mutable std::atomic_uint _refCount; + unsigned _setNum; + uint64_t _compareCode; std::string _str; char const *_cstr; - mutable uint64_t _compareCode; - mutable std::atomic_int _refCount; - mutable bool _isCounted; - mutable unsigned char _setNum; }; friend struct TfTokenFastArbitraryLessThan;
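
The key change in _Rep above is that the old separate _isCounted flag now lives in bit 0 of the atomic _refCount, and real references are counted in steps of 2. The following standalone sketch shows just that encoding; the class and method names are mine for illustration, not the TfToken API.

// Sketch of the refcount encoding: bit 0 records whether the object is
// reference-counted (mortal); actual references are counted in steps of 2.
// Illustrative only, not TfToken::_Rep.
#include <atomic>
#include <cassert>

struct CountedOrImmortal {
    // 0       : immortal
    // 2*n + 1 : mortal with n outstanding references
    mutable std::atomic<unsigned> refCount{0};

    bool IsCounted() const {
        return refCount.load(std::memory_order_relaxed) & 1;
    }
    // Returns true if the object was counted at the time of the increment.
    bool IncrementIfCounted() const {
        return refCount.fetch_add(2, std::memory_order_relaxed) & 1;
    }
    // Returns true if the object was counted at the time of the decrement.
    bool DecrementIfCounted() const {
        return refCount.fetch_sub(2, std::memory_order_release) & 1;
    }
    // A single atomic op makes the object immortal forever: clear bit 0.
    void MakeImmortal() const {
        refCount.fetch_and(~1u, std::memory_order_relaxed);
    }
};

int main() {
    CountedOrImmortal rep;
    // A newly created mortal object with one reference stores 3:
    // 1 (counted bit) plus 2 (one reference).
    rep.refCount.store(3, std::memory_order_relaxed);
    assert(rep.IsCounted());

    rep.IncrementIfCounted();           // two references, value is now 5
    assert(rep.refCount.load() == 5);

    rep.MakeImmortal();                 // clears bit 0, value is now 4
    assert(!rep.IsCounted());

    // Later increments still happen, but the low bit stays 0, so callers
    // observe that the object is no longer counted and stop adjusting.
    assert(!rep.IncrementIfCounted());
    return 0;
}

Because the flag and the count share one atomic word, a thread racing with immortalization either observes bit 0 set and adjusts the count harmlessly by 2, or observes it clear and stops adjusting; there is no window in which a separate flag and counter can disagree, which is the thread-safety problem the commit message calls out.
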