perf: cache loaded index metadata #1831

Merged: 9 commits, Jan 16, 2024
20 changes: 12 additions & 8 deletions python/python/tests/test_vector_index.py
@@ -337,8 +337,7 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path):
    if platform.system() == "Windows":
        expected_filepath = expected_filepath.replace("\\", "/")
    expected_statistics = {
-        "index_cache_entry_count": 1,
-        "index_cache_hit_rate": 0,
+        "index_cache_entry_count": 2,
        "index_type": "IVF",
        "uuid": index_uuid,
        "uri": expected_filepath,
@@ -356,15 +355,20 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path):
    }

    with pytest.raises(KeyError, match='Index "non-existent_idx" not found'):
+        # increase 1 miss of index_cache.metadata_cache
        assert dataset_with_index.stats.index_stats("non-existent_idx")
    with pytest.raises(KeyError, match='Index "" not found'):
+        # increase 1 miss of index_cache.metadata_cache
        assert dataset_with_index.stats.index_stats("")
    with pytest.raises(TypeError):
        dataset_with_index.stats.index_stats()

+    # increase 1 hit of index_cache.metadata_cache
    actual_statistics = dataset_with_index.stats.index_stats("vector_idx")
    partitions = actual_statistics.pop("partitions")
+    hit_rate = actual_statistics.pop("index_cache_hit_rate")
    assert actual_statistics == expected_statistics
+    assert np.isclose(hit_rate, 7 / 11)

    assert len(partitions) == 5
    partition_keys = {"index", "length", "offset", "centroid"}
@@ -534,24 +538,24 @@ def query_index(ds, ntimes):
    )

    assert (
-        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 1
+        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 2
    )
    query_index(indexed_dataset, 1)
    assert (
-        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 2
+        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 3
    )
-    assert (
-        indexed_dataset.stats.index_stats("vector_idx")["index_cache_hit_rate"] == 0.5
+    assert np.isclose(
+        indexed_dataset.stats.index_stats("vector_idx")["index_cache_hit_rate"], 18 / 25
    )
    query_index(indexed_dataset, 128)
    assert (
-        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 10
+        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 11
    )

    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=5)
    query_index(indexed_dataset, 128)
    assert (
-        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 5
+        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 6
    )

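Note on the expected ratio: the cache reports hit_rate = hits / (hits + misses) over every lookup recorded so far (see the CacheStats change in rust/lance/src/index/cache.rs below). The asserted 7 / 11 is consistent with 11 recorded lookups of which 7 hit: the two failing index_stats calls each add a metadata-cache miss, the final index_stats("vector_idx") adds a hit, and the remaining counts accumulate from the earlier index build and search in this test.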
19 changes: 13 additions & 6 deletions rust/lance-index/src/traits.rs
@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-use async_trait::async_trait;
+use std::sync::Arc;

+use async_trait::async_trait;
use lance_core::{format::Index, Result};

use crate::{IndexParams, IndexType};
@@ -44,23 +45,29 @@ pub trait DatasetIndexExt {
    ) -> Result<()>;

    /// Read all indices of this Dataset version.
-    async fn load_indices(&self) -> Result<Vec<Index>>;
+    ///
+    /// The indices are lazily loaded and cached in memory within the [`Dataset`] instance.
+    /// The cache is invalidated when the dataset version (Manifest) is changed.
+    async fn load_indices(&self) -> Result<Arc<Vec<Index>>>;

    /// Loads all the indices with a given UUID.
    ///
    /// Note that it is possible to have multiple indices with the same UUID,
    /// as they are the deltas of the same index.
    async fn load_index(&self, uuid: &str) -> Result<Option<Index>> {
-        self.load_indices()
-            .await
-            .map(|indices| indices.into_iter().find(|idx| idx.uuid.to_string() == uuid))
+        self.load_indices().await.map(|indices| {
+            indices
+                .iter()
+                .find(|idx| idx.uuid.to_string() == uuid)
+                .cloned()
+        })
    }

    /// Loads a specific index with the given index name.
    async fn load_index_by_name(&self, name: &str) -> Result<Option<Index>> {
        self.load_indices()
            .await
-            .map(|indices| indices.into_iter().find(|idx| idx.name == name))
+            .map(|indices| indices.iter().find(|idx| idx.name == name).cloned())
    }

    /// Loads a specific index with the given index name.
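The return-type change from Vec<Index> to Arc<Vec<Index>> is what lets one cached snapshot be shared by every caller; the default methods then borrow the snapshot and clone only the single entry they return. A minimal, self-contained sketch of that pattern (IndexMeta and the values are illustrative stand-ins for lance_core::format::Index):

```rust
use std::sync::Arc;

#[derive(Clone, Debug)]
struct IndexMeta {
    name: String,
    uuid: String,
}

// Mirrors `load_index_by_name`: borrow the shared snapshot, find one
// entry, and clone just that entry for the caller.
fn find_by_name(indices: &Arc<Vec<IndexMeta>>, name: &str) -> Option<IndexMeta> {
    indices.iter().find(|idx| idx.name == name).cloned()
}

fn main() {
    let snapshot = Arc::new(vec![IndexMeta {
        name: "vector_idx".to_string(),
        uuid: "deadbeef".to_string(),
    }]);
    // Sharing the snapshot copies a pointer, not the Vec.
    let shared = Arc::clone(&snapshot);
    let found = find_by_name(&shared, "vector_idx").unwrap();
    assert_eq!(found.uuid, "deadbeef");
}
```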
10 changes: 5 additions & 5 deletions rust/lance/src/dataset.rs
@@ -74,11 +74,11 @@ use crate::error::box_error;
use crate::format::{Fragment, Index, Manifest};
use crate::io::commit::{commit_new_dataset, commit_transaction};
use crate::session::Session;

use crate::utils::temporal::{timestamp_to_nanos, utc_now, SystemTime};
use crate::{Error, Result};
use hash_joiner::HashJoiner;
pub use lance_core::ROW_ID;

pub use write::update::{UpdateBuilder, UpdateJob};
pub use write::{write_fragments, WriteMode, WriteParams};

@@ -3724,10 +3724,10 @@ mod tests {
        // Any transaction, no matter how simple, should trigger the fragment bitmap to be recalculated
        dataset.append(data, None).await.unwrap();

-        for idx in dataset.load_indices().await.unwrap() {
+        for idx in dataset.load_indices().await.unwrap().iter() {
            // The corrupt fragment_bitmap does not contain 0 but the
            // restored one should
-            assert!(idx.fragment_bitmap.unwrap().contains(0));
+            assert!(idx.fragment_bitmap.as_ref().unwrap().contains(0));
        }

        let mut dataset = dataset.checkout_version(broken_version).await.unwrap();
@@ -3739,8 +3739,8 @@ mod tests {
            .await
            .unwrap();

-        for idx in dataset.load_indices().await.unwrap() {
-            assert!(idx.fragment_bitmap.unwrap().contains(0));
+        for idx in dataset.load_indices().await.unwrap().iter() {
+            assert!(idx.fragment_bitmap.as_ref().unwrap().contains(0));
        }

        let mut scan = dataset.scan();
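Because the loop now iterates over a borrowed Arc<Vec<_>> (indices.iter() yields &Index), field access can no longer move values out, which is why fragment_bitmap.unwrap() becomes fragment_bitmap.as_ref().unwrap(). A tiny sketch of the borrow rule in isolation (Vec<u32> stands in for the real RoaringBitmap):

```rust
#[derive(Debug)]
struct IndexMeta {
    fragment_bitmap: Option<Vec<u32>>, // RoaringBitmap in the real code
}

fn main() {
    let indices = vec![IndexMeta {
        fragment_bitmap: Some(vec![0, 1, 2]),
    }];
    for idx in indices.iter() {
        // `idx` is `&IndexMeta`, so borrow the Option's contents with
        // `as_ref()` instead of trying to move them out of the borrow.
        assert!(idx.fragment_bitmap.as_ref().unwrap().contains(&0));
    }
}
```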
4 changes: 2 additions & 2 deletions rust/lance/src/dataset/index.rs
@@ -68,15 +68,15 @@ impl IndexRemapper for DatasetIndexRemapper {
        let affected_frag_ids = HashSet::<u64>::from_iter(affected_fragment_ids.iter().copied());
        let indices = self.dataset.load_indices().await?;
        let mut remapped = Vec::with_capacity(indices.len());
-        for index in indices {
+        for index in indices.iter() {
            let needs_remapped = match &index.fragment_bitmap {
                None => true,
                Some(fragment_bitmap) => fragment_bitmap
                    .iter()
                    .any(|frag_idx| affected_frag_ids.contains(&(frag_idx as u64))),
            };
            if needs_remapped {
-                remapped.push(self.remap_index(&index, &mapping).await?);
+                remapped.push(self.remap_index(index, &mapping).await?);
            }
        }
        Ok(remapped)
4 changes: 2 additions & 2 deletions rust/lance/src/dataset/optimize.rs
@@ -474,8 +474,8 @@ impl CandidateBin {
async fn load_index_fragmaps(dataset: &Dataset) -> Result<Vec<RoaringBitmap>> {
    let indices = dataset.load_indices().await?;
    let mut index_fragmaps = Vec::with_capacity(indices.len());
-    for index in indices {
-        if let Some(fragment_bitmap) = index.fragment_bitmap {
+    for index in indices.iter() {
+        if let Some(fragment_bitmap) = index.fragment_bitmap.as_ref() {
            index_fragmaps.push(fragment_bitmap.clone());
        } else {
            let dataset_at_index = dataset.checkout_version(index.dataset_version).await?;
12 changes: 9 additions & 3 deletions rust/lance/src/dataset/scanner.rs
@@ -851,7 +851,7 @@ impl Scanner {
        let indices = if use_index {
            self.dataset.load_indices().await?
        } else {
-            vec![]
+            Arc::new(vec![])
        };
        let knn_idx = indices.iter().find(|i| i.fields.contains(&column_id));
        if let Some(index) = knn_idx {
@@ -2499,7 +2499,10 @@ mod test {
        scan.refine(100);
        scan.nprobs(100);

-        assert_eq!(dataset.index_cache_entry_count(), 0);
+        assert_eq!(
+            dataset.index_cache_entry_count(),
+            1, // 1 for index metadata
+        );
        let results = scan
            .try_into_stream()
            .await
@@ -2508,7 +2511,10 @@
            .await
            .unwrap();

-        assert_eq!(dataset.index_cache_entry_count(), 5);
+        assert_eq!(
+            dataset.index_cache_entry_count(),
+            5 + dataset.versions().await.unwrap().len()
+        );
        assert_eq!(results.len(), 1);
        let batch = &results[0];

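A plausible reading of the new arithmetic: every dataset version whose index metadata has been loaded contributes one cache entry keyed "{base_path}:{version}" (see cache.rs below), so the expected total becomes the 5 cached IVF partitions plus one metadata entry per version, rather than a bare 5.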
28 changes: 24 additions & 4 deletions rust/lance/src/index.rs
@@ -252,16 +252,35 @@ impl DatasetIndexExt for Dataset {
        Ok(())
    }

-    async fn load_indices(&self) -> Result<Vec<IndexMetadata>> {
+    async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>> {
+        let dataset_dir = self.base.to_string();
+        if let Some(indices) = self
+            .session
+            .index_cache
+            .get_metadata(&dataset_dir, self.version().version)
+        {
+            return Ok(indices);
+        }
+
        let manifest_file = self.manifest_file(self.version().version).await?;
-        read_manifest_indexes(&self.object_store, &manifest_file, &self.manifest).await
+        let loaded_indices: Arc<Vec<IndexMetadata>> =
+            read_manifest_indexes(&self.object_store, &manifest_file, &self.manifest)
+                .await?
+                .into();
+
+        self.session.index_cache.insert_metadata(
+            &dataset_dir,
+            self.version().version,
+            loaded_indices.clone(),
+        );
+        Ok(loaded_indices)
    }

    async fn load_scalar_index_for_column(&self, col: &str) -> Result<Option<IndexMetadata>> {
        Ok(self
            .load_indices()
            .await?
-            .into_iter()
+            .iter()
            .filter(|idx| idx.fields.len() == 1)
            .find(|idx| {
                let field = self.schema().field_by_id(idx.fields[0]);
@@ -270,7 +289,8 @@
                } else {
                    false
                }
-            }))
+            })
+            .cloned())
    }

    #[instrument(skip_all)]
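The body of load_indices above is a read-through cache: return the cached Arc on a hit, otherwise load, insert, and hand back the same Arc. Here is a self-contained sketch of that pattern against moka's sync Cache (the crate cache.rs already uses); the path, version, and loader closure are illustrative stand-ins for read_manifest_indexes:

```rust
use std::sync::Arc;

use moka::sync::Cache;

// Key by (base_path, version); a new dataset version naturally misses
// and gets its own entry, which is how invalidation happens here.
fn load_with_cache(
    cache: &Cache<String, Arc<Vec<String>>>,
    base_path: &str,
    version: u64,
    load: impl FnOnce() -> Vec<String>, // stand-in for read_manifest_indexes
) -> Arc<Vec<String>> {
    let key = format!("{}:{}", base_path, version);
    if let Some(hit) = cache.get(&key) {
        return hit; // hit: share the cached snapshot
    }
    let loaded = Arc::new(load());
    cache.insert(key, loaded.clone());
    loaded
}

fn main() {
    let cache: Cache<String, Arc<Vec<String>>> = Cache::new(128);
    let first = load_with_cache(&cache, "s3://bucket/ds", 1, || vec!["vector_idx".into()]);
    let second = load_with_cache(&cache, "s3://bucket/ds", 1, || unreachable!("expected a hit"));
    assert!(Arc::ptr_eq(&first, &second)); // both callers share one snapshot
}
```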
5 changes: 4 additions & 1 deletion rust/lance/src/index/append.rs
@@ -246,6 +246,9 @@
            .iter()
            .sum::<usize>();
        assert_eq!(row_in_index, 2000);
-        assert_eq!(dataset.index_cache_entry_count(), 6)
+        assert_eq!(
+            dataset.index_cache_entry_count(),
+            6 + dataset.versions().await.unwrap().len()
+        );
    }
}
62 changes: 53 additions & 9 deletions rust/lance/src/index/cache.rs
@@ -14,6 +14,7 @@

use std::sync::Arc;

+use lance_core::format::Index;
use lance_index::scalar::ScalarIndex;
use moka::sync::{Cache, ConcurrentCacheExt};

@@ -22,25 +23,33 @@ use super::vector::VectorIndex;
use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Debug, Default)]
-pub struct CacheStats {
+struct CacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CacheStats {
-    pub fn record_hit(&self) {
+    fn record_hit(&self) {
        self.hits.fetch_add(1, Ordering::Relaxed);
    }

-    pub fn record_miss(&self) {
+    fn record_miss(&self) {
        self.misses.fetch_add(1, Ordering::Relaxed);
    }
}

#[derive(Clone)]
pub struct IndexCache {
+    // TODO: Can we merge these two caches into one for uniform memory management?
(Inline review comment on the TODO above, from a project contributor: "Downcasting/upcasting is weird. You can try. I got spun around and gave up on it at the time simply because the PR had lots of other stuff.")
    scalar_cache: Arc<Cache<String, Arc<dyn ScalarIndex>>>,
    vector_cache: Arc<Cache<String, Arc<dyn VectorIndex>>>,
+
+    /// Index metadata cache.
+    ///
+    /// The key is "{dataset_base_path}:{version}".
+    /// The value is all the indices of a particular version of the dataset.
+    metadata_cache: Arc<Cache<String, Arc<Vec<Index>>>>,
+
    cache_stats: Arc<CacheStats>,
}

@@ -49,6 +58,7 @@ impl IndexCache {
        Self {
            scalar_cache: Arc::new(Cache::new(capacity as u64)),
            vector_cache: Arc::new(Cache::new(capacity as u64)),
+            metadata_cache: Arc::new(Cache::new(capacity as u64)),
            cache_stats: Arc::new(CacheStats::default()),
        }
    }
@@ -62,21 +72,31 @@ impl IndexCache {
    pub(crate) fn get_size(&self) -> usize {
        self.scalar_cache.sync();
        self.vector_cache.sync();
-        self.scalar_cache.entry_count() as usize + self.vector_cache.entry_count() as usize
+        self.metadata_cache.sync();
+        (self.scalar_cache.entry_count()
+            + self.vector_cache.entry_count()
+            + self.metadata_cache.entry_count()) as usize
    }

    /// Get an Index if present. Otherwise returns [None].
    pub(crate) fn get_scalar(&self, key: &str) -> Option<Arc<dyn ScalarIndex>> {
-        self.scalar_cache.get(key)
+        if let Some(index) = self.scalar_cache.get(key) {
+            self.cache_stats.record_hit();
+            Some(index)
+        } else {
+            self.cache_stats.record_miss();
+            None
+        }
    }

    pub(crate) fn get_vector(&self, key: &str) -> Option<Arc<dyn VectorIndex>> {
-        if self.vector_cache.contains_key(key) || self.scalar_cache.contains_key(key) {
+        if let Some(index) = self.vector_cache.get(key) {
            self.cache_stats.record_hit();
+            Some(index)
        } else {
            self.cache_stats.record_miss();
+            None
        }
-        self.vector_cache.get(key)
    }

    /// Insert a new entry into the cache.
@@ -88,15 +108,39 @@ impl IndexCache {
        self.vector_cache.insert(key.to_string(), index);
    }

+    /// Construct a key for index metadata arrays.
+    fn metadata_key(dataset_uuid: &str, version: u64) -> String {
+        format!("{}:{}", dataset_uuid, version)
+    }
+
+    /// Get all index metadata for a particular dataset version.
+    pub(crate) fn get_metadata(&self, key: &str, version: u64) -> Option<Arc<Vec<Index>>> {
+        let key = Self::metadata_key(key, version);
+        if let Some(indices) = self.metadata_cache.get(&key) {
+            self.cache_stats.record_hit();
+            Some(indices)
+        } else {
+            self.cache_stats.record_miss();
+            None
+        }
+    }
+
+    pub(crate) fn insert_metadata(&self, key: &str, version: u64, indices: Arc<Vec<Index>>) {
+        let key = Self::metadata_key(key, version);
+
+        self.metadata_cache.insert(key, indices);
+    }
+
    /// Get cache hit ratio.
+    #[allow(dead_code)]
    pub(crate) fn hit_rate(&self) -> f32 {
        let hits = self.cache_stats.hits.load(Ordering::Relaxed) as f32;
        let misses = self.cache_stats.misses.load(Ordering::Relaxed) as f32;
        // Returns 1.0 if hits + misses == 0 and avoids division by zero.
        if (hits + misses) == 0.0 {
-            return 1.0;
+            1.0
+        } else {
+            hits / (hits + misses)
        }
-        hits / (hits + misses)
    }
}
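As a sanity check on the statistics above, here is the hit-rate math in a standalone, runnable form, including the convention of reporting 1.0 for a cache that has never been queried; 7 hits and 4 misses reproduce the 7 / 11 asserted in the Python test:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Default)]
struct CacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CacheStats {
    fn hit_rate(&self) -> f32 {
        let hits = self.hits.load(Ordering::Relaxed) as f32;
        let misses = self.misses.load(Ordering::Relaxed) as f32;
        if (hits + misses) == 0.0 {
            1.0 // no lookups yet counts as a perfect hit rate
        } else {
            hits / (hits + misses)
        }
    }
}

fn main() {
    let stats = CacheStats::default();
    assert_eq!(stats.hit_rate(), 1.0);
    stats.hits.fetch_add(7, Ordering::Relaxed);
    stats.misses.fetch_add(4, Ordering::Relaxed);
    assert!((stats.hit_rate() - 7.0 / 11.0).abs() < 1e-6); // ~0.636
}
```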