From d02b462eee65eed5417906e6e8830e7629bb22c1 Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@crate.io>
Date: Tue, 21 Nov 2023 13:12:21 +0100
Subject: [PATCH 1/3] CrateDB vector: Add CrateDBVectorSearchMultiCollection

It is a special adapter that provides similarity search across multiple
collections. It cannot be used for indexing documents.
---
 docs/docs/integrations/providers/cratedb.mdx  |   3 +
 .../integrations/vectorstores/cratedb.ipynb   |  88 +++++++-----
 .../vectorstores/cratedb/__init__.py          |   2 +
 .../langchain/vectorstores/cratedb/base.py    |  22 ++-
 .../vectorstores/cratedb/extended.py          |  92 ++++++++++++
 .../langchain/vectorstores/cratedb/model.py   |  15 +-
 .../vectorstores/fake_embeddings.py           |   1 -
 .../vectorstores/test_cratedb.py              | 131 +++++++++++++-----
 8 files changed, 281 insertions(+), 73 deletions(-)
 create mode 100644 libs/langchain/langchain/vectorstores/cratedb/extended.py

diff --git a/docs/docs/integrations/providers/cratedb.mdx b/docs/docs/integrations/providers/cratedb.mdx
index 5472f875f05da..4764a7ad92369 100644
--- a/docs/docs/integrations/providers/cratedb.mdx
+++ b/docs/docs/integrations/providers/cratedb.mdx
@@ -106,6 +106,9 @@ export OPENAI_API_KEY=foobar  # FIXME
 export CRATEDB_CONNECTION_STRING=crate://crate@localhost
 ```
 
+### Example
+
+Load and index documents, then run a query.
 ```python
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
diff --git a/docs/docs/integrations/vectorstores/cratedb.ipynb b/docs/docs/integrations/vectorstores/cratedb.ipynb
index 462e721bfff40..06430e6355ae9 100644
--- a/docs/docs/integrations/vectorstores/cratedb.ipynb
+++ b/docs/docs/integrations/vectorstores/cratedb.ipynb
@@ -182,7 +182,11 @@
   {
    "cell_type": "markdown",
    "source": [
-    "Next, you will read input data, and tokenize it."
+    "## Load and Index Documents\n",
+    "\n",
+    "Next, you will read input data, and tokenize it. The module will create a table\n",
+    "with the name of the collection. Make sure the collection name is unique, and\n",
+    "that you have the permission to create a table."
    ],
    "metadata": {
     "collapsed": false
@@ -196,7 +200,18 @@
     "loader = UnstructuredURLLoader(\"https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt\")\n",
     "documents = loader.load()\n",
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
-    "docs = text_splitter.split_documents(documents)"
+    "docs = text_splitter.split_documents(documents)\n",
+    "\n",
+    "COLLECTION_NAME = \"state_of_the_union_test\"\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings()\n",
+    "\n",
+    "db = CrateDBVectorSearch.from_documents(\n",
+    "    embedding=embeddings,\n",
+    "    documents=docs,\n",
+    "    collection_name=COLLECTION_NAME,\n",
+    "    connection_string=CONNECTION_STRING,\n",
+    ")"
    ],
    "metadata": {
     "collapsed": false,
@@ -208,39 +223,15 @@
   {
    "cell_type": "markdown",
    "source": [
-    "## Similarity Search with Euclidean Distance (Default)\n",
+    "## Search Documents\n",
     "\n",
-    "The module will create a table with the name of the collection. Make sure\n",
-    "the collection name is unique and that you have the permission to create\n",
-    "a table."
+    "### Similarity Search with Euclidean Distance\n",
+    "Searching by euclidean distance is the default."
    ],
    "metadata": {
     "collapsed": false
    }
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-09-09T08:04:16.696625Z",
-     "start_time": "2023-09-09T08:02:31.817790Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "COLLECTION_NAME = \"state_of_the_union_test\"\n",
-    "\n",
-    "embeddings = OpenAIEmbeddings()\n",
-    "\n",
-    "db = CrateDBVectorSearch.from_documents(\n",
-    "    embedding=embeddings,\n",
-    "    documents=docs,\n",
-    "    collection_name=COLLECTION_NAME,\n",
-    "    connection_string=CONNECTION_STRING,\n",
-    ")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -277,7 +268,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "## Maximal Marginal Relevance Search (MMR)\n",
+    "### Maximal Marginal Relevance Search (MMR)\n",
     "Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents."
    ],
    "metadata": {
@@ -318,11 +309,40 @@
     }
    }
   },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Searching in Multiple Collections\n",
+    "`CrateDBVectorSearchMultiCollection` is a special adapter which provides similarity search across\n",
+    "multiple collections. It can not be used for indexing documents."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "from langchain.vectorstores.cratedb import CrateDBVectorSearchMultiCollection\n",
+    "\n",
+    "multisearch = CrateDBVectorSearchMultiCollection(\n",
+    "    collection_names=[\"test_collection_1\", \"test_collection_2\"],\n",
+    "    embedding_function=embeddings,\n",
+    "    connection_string=CONNECTION_STRING,\n",
+    ")\n",
+    "docs_with_score = multisearch.similarity_search_with_score(query)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Working with the vector store\n",
+    "## Working with the Vector Store\n",
     "\n",
     "In the example above, you created a vector store from scratch. When\n",
     "aiming to work with an existing vector store, you can initialize it directly."
@@ -345,7 +365,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Add documents\n",
+    "### Add Documents\n",
     "\n",
     "You can also add documents to an existing vector store."
    ]
@@ -390,7 +410,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Overwriting a vector store\n",
+    "### Overwriting a Vector Store\n",
     "\n",
     "If you have an existing collection, you can overwrite it by using `from_documents`,\n",
     "aad setting `pre_delete_collection = True`."
@@ -433,7 +453,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Using a vector store as a retriever"
+    "### Using a Vector Store as a Retriever"
    ]
   },
   {
diff --git a/libs/langchain/langchain/vectorstores/cratedb/__init__.py b/libs/langchain/langchain/vectorstores/cratedb/__init__.py
index 14b02ad126867..62462bce1eba9 100644
--- a/libs/langchain/langchain/vectorstores/cratedb/__init__.py
+++ b/libs/langchain/langchain/vectorstores/cratedb/__init__.py
@@ -1,5 +1,7 @@
 from .base import CrateDBVectorSearch
+from .extended import CrateDBVectorSearchMultiCollection
 
 __all__ = [
     "CrateDBVectorSearch",
+    "CrateDBVectorSearchMultiCollection",
 ]
diff --git a/libs/langchain/langchain/vectorstores/cratedb/base.py b/libs/langchain/langchain/vectorstores/cratedb/base.py
index ec3c4c19d70a6..922ba2ed659d6 100644
--- a/libs/langchain/langchain/vectorstores/cratedb/base.py
+++ b/libs/langchain/langchain/vectorstores/cratedb/base.py
@@ -250,8 +250,26 @@ def _query_collection(
             collection = self.get_collection(session)
             if not collection:
                 raise ValueError("Collection not found")
+            return self._query_collection_multi(
+                collections=[collection], embedding=embedding, k=k, filter=filter
+            )
 
-            filter_by = self.EmbeddingStore.collection_id == collection.uuid
+    def _query_collection_multi(
+        self,
+        collections: List[Any],
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[Dict[str, str]] = None,
+    ) -> List[Any]:
+        """Query the collection."""
+        self._init_models(embedding)
+
+        collection_names = [coll.name for coll in collections]
+        collection_uuids = [coll.uuid for coll in collections]
+        self.logger.info(f"Querying collections: {collection_names}")
+
+        with self.Session() as session:
+            filter_by = self.EmbeddingStore.collection_id.in_(collection_uuids)
 
             if filter is not None:
                 filter_clauses = []
@@ -271,7 +289,7 @@ def _query_collection(
                         )  # type: ignore[assignment]
                         filter_clauses.append(filter_by_metadata)
 
-                filter_by = sqlalchemy.and_(filter_by, *filter_clauses)
+                filter_by = sqlalchemy.and_(filter_by, *filter_clauses)  # type: ignore[assignment]
 
             _type = self.EmbeddingStore
 
diff --git a/libs/langchain/langchain/vectorstores/cratedb/extended.py b/libs/langchain/langchain/vectorstores/cratedb/extended.py
new file mode 100644
index 0000000000000..9266438787368
--- /dev/null
+++ b/libs/langchain/langchain/vectorstores/cratedb/extended.py
@@ -0,0 +1,92 @@
+import logging
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+)
+
+import sqlalchemy
+from sqlalchemy.orm import sessionmaker
+
+from langchain.schema.embeddings import Embeddings
+from langchain.vectorstores.cratedb.base import (
+    DEFAULT_DISTANCE_STRATEGY,
+    CrateDBVectorSearch,
+    DistanceStrategy,
+)
+from langchain.vectorstores.pgvector import _LANGCHAIN_DEFAULT_COLLECTION_NAME
+
+
+class CrateDBVectorSearchMultiCollection(CrateDBVectorSearch):
+    """
+    Search-only adapter running similarity search across multiple collections.
+    It cannot be used for indexing documents; `_from` raises `NotImplementedError`.
+
+    To use it, you should have the ``crate[sqlalchemy]`` Python package installed.
+
+    Synopsis::
+
+        from langchain.vectorstores.cratedb import CrateDBVectorSearchMultiCollection
+
+        multisearch = CrateDBVectorSearchMultiCollection(
+            collection_names=["collection_foo", "collection_bar"],
+            embedding_function=embeddings,
+            connection_string=CONNECTION_STRING,
+        )
+        docs_with_score = multisearch.similarity_search_with_score(query)
+    """
+
+    def __init__(
+        self,
+        connection_string: str,
+        embedding_function: Embeddings,
+        collection_names: List[str] = [_LANGCHAIN_DEFAULT_COLLECTION_NAME],
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,  # type: ignore[arg-type]
+        logger: Optional[logging.Logger] = None,
+        relevance_score_fn: Optional[Callable[[float], float]] = None,
+        *,
+        connection: Optional[sqlalchemy.engine.Connection] = None,
+        engine_args: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self.connection_string = connection_string
+        self.embedding_function = embedding_function
+        self.collection_names = list(collection_names)  # copy; default list is shared
+        self._distance_strategy = distance_strategy  # type: ignore[assignment]
+        self.logger = logger or logging.getLogger(__name__)
+        self.override_relevance_score_fn = relevance_score_fn
+        self.engine_args = engine_args or {}
+        # Always create engine and session factory; reuse `connection` when provided.
+        self._engine = self.create_engine()
+        self.Session = sessionmaker(self._engine)
+        self._conn = connection if connection else self.connect()
+        self.__post_init__()
+
+    @classmethod
+    def _from(cls, *args: List, **kwargs: Dict):  # type: ignore[no-untyped-def,override]
+        raise NotImplementedError("This adapter can not be used for indexing documents")
+
+    def get_collections(self, session: sqlalchemy.orm.Session) -> Any:
+        if self.CollectionStore is None:
+            raise RuntimeError(
+                "Collection can't be accessed without specifying "
+                "dimension size of embedding vectors"
+            )
+        return self.CollectionStore.get_by_names(session, self.collection_names)
+
+    def _query_collection(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[Dict[str, str]] = None,
+    ) -> List[Any]:
+        """Query all configured collections; raise ValueError if none are found."""
+        self._init_models(embedding)
+        with self.Session() as session:
+            collections = self.get_collections(session)
+            if not collections:
+                raise ValueError("No collections found")
+            return self._query_collection_multi(
+                collections=collections, embedding=embedding, k=k, filter=filter
+            )
diff --git a/libs/langchain/langchain/vectorstores/cratedb/model.py b/libs/langchain/langchain/vectorstores/cratedb/model.py
index 656de41bf4d45..1aec9b49a7260 100644
--- a/libs/langchain/langchain/vectorstores/cratedb/model.py
+++ b/libs/langchain/langchain/vectorstores/cratedb/model.py
@@ -1,5 +1,5 @@
 import uuid
-from typing import Any, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import sqlalchemy
 from crate.client.sqlalchemy.types import ObjectType
@@ -62,6 +62,19 @@ def get_by_name(
                         raise
                 return None
 
+            @classmethod
+            def get_by_names(
+                cls, session: Session, names: List[str]
+            ) -> Optional["List[CollectionStore]"]:
+                try:
+                    return (
+                        session.query(cls).filter(cls.name.in_(names)).all()  # type: ignore[attr-defined]  # noqa: E501
+                    )
+                except sqlalchemy.exc.ProgrammingError as ex:
+                    if "RelationUnknown" not in str(ex):
+                        raise
+                return None
+
             @classmethod
             def get_or_create(
                 cls,
diff --git a/libs/langchain/tests/integration_tests/vectorstores/fake_embeddings.py b/libs/langchain/tests/integration_tests/vectorstores/fake_embeddings.py
index 87ea1edc6a00b..209e933b24b61 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/fake_embeddings.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/fake_embeddings.py
@@ -52,7 +52,6 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
     def embed_query(self, text: str) -> List[float]:
         """Return consistent embeddings for the text, if seen before, or a constant
         one if the text is unknown."""
-        return self.embed_documents([text])[0]
         if text not in self.known_texts:
             return [float(1.0)] * (self.dimensionality - 1) + [float(0.0)]
         return [float(1.0)] * (self.dimensionality - 1) + [
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
index d573843d2f02f..5a732ca5332f9 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
@@ -5,15 +5,17 @@
 docker-compose -f cratedb.yml up
 """
 import os
-from typing import Generator, List, Tuple
+from typing import Dict, Generator, List, Tuple
 
 import pytest
 import sqlalchemy as sa
+import sqlalchemy.orm
 from sqlalchemy.exc import ProgrammingError
 from sqlalchemy.orm import Session
 
 from langchain.docstore.document import Document
 from langchain.vectorstores.cratedb import CrateDBVectorSearch
+from langchain.vectorstores.cratedb.extended import CrateDBVectorSearchMultiCollection
 from langchain.vectorstores.cratedb.model import ModelFactory
 from tests.integration_tests.vectorstores.fake_embeddings import (
     ConsistentFakeEmbeddings,
@@ -151,17 +153,17 @@ def embed_query(self, text: str) -> List[float]:
         return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)]
 
 
-class ConsistentFakeEmbeddingsWithAdaDimension(
-    FakeEmbeddingsWithAdaDimension, ConsistentFakeEmbeddings
-):
+class ConsistentFakeEmbeddingsWithAdaDimension(ConsistentFakeEmbeddings):
     """
-    Fake embeddings which remember all the texts seen so far to return consistent
-    vectors for the same texts.
+    Fake embeddings which remember all the texts seen so far to return
+    consistent vectors for the same texts.
 
-    Other than this, they also have a dimensionality, which is important in this case.
+    Other than this, they also have a fixed dimensionality, which is
+    important in this case.
     """
 
-    pass
+    def __init__(self, *args: List, **kwargs: Dict) -> None:
+        super().__init__(dimensionality=ADA_TOKEN_COUNT)
 
 
 def test_cratedb_texts() -> None:
@@ -223,12 +225,7 @@ def test_cratedb_with_metadatas_with_scores() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search_with_score("foo", k=1)
-    # TODO: Original:
-    #       assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]  # noqa: E501
-    assert output in [
-        [(Document(page_content="foo", metadata={"page": "0"}), 1.0828735)],
-        [(Document(page_content="foo", metadata={"page": "0"}), 1.1307646)],
-    ]
+    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 2.0)]
 
 
 def test_cratedb_with_filter_match() -> None:
@@ -247,9 +244,8 @@ def test_cratedb_with_filter_match() -> None:
     # TODO: Original:
     #       assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]  # noqa: E501
     assert output in [
-        [(Document(page_content="foo", metadata={"page": "0"}), 1.2615292)],
-        [(Document(page_content="foo", metadata={"page": "0"}), 1.3979403)],
-        [(Document(page_content="foo", metadata={"page": "0"}), 1.5065275)],
+        [(Document(page_content="foo", metadata={"page": "0"}), 2.1307645)],
+        [(Document(page_content="foo", metadata={"page": "0"}), 2.3150668)],
     ]
 
 
@@ -265,10 +261,9 @@ def test_cratedb_with_filter_distant_match() -> None:
         connection_string=CONNECTION_STRING,
         pre_delete_collection=True,
     )
+    output = docsearch.similarity_search_with_score("foo", k=2, filter={"page": "2"})
     # TODO: Original:
-    # output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})
-    output = docsearch.similarity_search_with_score("foo", k=3, filter={"page": "2"})
-    # TODO: Original:
+    #       output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})  # noqa: E501
     #       assert output == [
     #         (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406)  # noqa: E501
     #       ]
@@ -277,9 +272,10 @@ def test_cratedb_with_filter_distant_match() -> None:
         Document(page_content="baz", metadata={"page": "2"}),
     ]
     assert scores in [
-        [0.5],
-        [0.6],
-        [0.7],
+        [1.3],
+        [1.5],
+        [1.6],
+        [1.7],
     ]
 
 
@@ -439,7 +435,7 @@ def test_cratedb_with_filter_in_set() -> None:
         Document(page_content="foo", metadata={"page": "0"}),
         Document(page_content="baz", metadata={"page": "2"}),
     ]
-    assert scores == [2.1, 1.3]
+    assert scores == [3.0, 2.2]
 
 
 def test_cratedb_delete_docs() -> None:
@@ -498,7 +494,7 @@ def test_cratedb_relevance_score() -> None:
         Document(page_content="bar", metadata={"page": "1"}),
         Document(page_content="baz", metadata={"page": "2"}),
     ]
-    assert scores == [0.8, 0.4, 0.2]
+    assert scores == [1.4, 1.1, 0.8]
 
 
 def test_cratedb_retriever_search_threshold() -> None:
@@ -516,9 +512,7 @@ def test_cratedb_retriever_search_threshold() -> None:
 
     retriever = docsearch.as_retriever(
         search_type="similarity_score_threshold",
-        # TODO: Original:
-        #       search_kwargs={"k": 3, "score_threshold": 0.999},
-        search_kwargs={"k": 3, "score_threshold": 0.333},
+        search_kwargs={"k": 3, "score_threshold": 0.999},
     )
     output = retriever.get_relevant_documents("summer")
     assert output == [
@@ -574,10 +568,77 @@ def test_cratedb_max_marginal_relevance_search_with_score() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3)
-    # TODO: Original:
-    #       assert output == [(Document(page_content="foo"), 0.0)]
-    assert output in [
-        [(Document(page_content="foo"), 1.0606961)],
-        [(Document(page_content="foo"), 1.0828735)],
-        [(Document(page_content="foo"), 1.1307646)],
-    ]
+    assert output == [(Document(page_content="foo"), 2.0)]
+
+
+def test_cratedb_multicollection_search_success() -> None:
+    """
+    `CrateDBVectorSearchMultiCollection` provides functionality for
+    searching multiple collections.
+    """
+
+    store_1 = CrateDBVectorSearch.from_texts(
+        texts=["Räuber", "Hotzenplotz"],
+        collection_name="test_collection_1",
+        embedding=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+    _ = CrateDBVectorSearch.from_texts(
+        texts=["John", "Doe"],
+        collection_name="test_collection_2",
+        embedding=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+
+    # Probe the first store.
+    output = store_1.similarity_search("Räuber", k=1)
+    assert Document(page_content="Räuber") in output[:2]
+    output = store_1.similarity_search("Hotzenplotz", k=1)
+    assert Document(page_content="Hotzenplotz") in output[:2]
+    output = store_1.similarity_search("John Doe", k=1)
+    assert Document(page_content="Räuber") in output[:2]
+
+    # Probe the multi-store.
+    multisearch = CrateDBVectorSearchMultiCollection(
+        collection_names=["test_collection_1", "test_collection_2"],
+        embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+    )
+    output = multisearch.similarity_search("Räuber Hotzenplotz", k=2)
+    assert Document(page_content="Räuber") in output[:2]
+    output = multisearch.similarity_search("John Doe", k=2)
+    assert Document(page_content="John") in output[:2]
+
+
+def test_cratedb_multicollection_fail_indexing_not_permitted() -> None:
+    """
+    `CrateDBVectorSearchMultiCollection` does not provide functionality for
+    indexing documents.
+    """
+
+    with pytest.raises(NotImplementedError) as ex:
+        CrateDBVectorSearchMultiCollection.from_texts(
+            texts=["foo"],
+            collection_name="test_collection",
+            embedding=FakeEmbeddingsWithAdaDimension(),
+            connection_string=CONNECTION_STRING,
+        )
+    assert ex.match("This adapter can not be used for indexing documents")
+
+
+def test_cratedb_multicollection_search_no_collections() -> None:
+    """
+    `CrateDBVectorSearchMultiCollection` will fail when not able to identify
+    collections to search in.
+    """
+
+    store = CrateDBVectorSearchMultiCollection(
+        collection_names=["unknown"],
+        embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+    )
+    with pytest.raises(ValueError) as ex:
+        store.similarity_search("foo")
+    assert ex.match("No collections found")

From d9569d1693808107cb5019f2b26d9c8b5de4a807 Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@crate.io>
Date: Tue, 21 Nov 2023 16:32:21 +0100
Subject: [PATCH 2/3] CrateDB vector: Improve SQLAlchemy data model query
 utility functions

The CrateDB adapter works a bit differently from the pgvector adapter
it builds upon: The dimensionality of the vector field must be specified
at table creation time, but in LangChain it is a runtime parameter, so
the table creation needs to be delayed.

In some cases, the tables do not exist yet. This is only relevant when
the user requests to pre-delete the collection, using the
`pre_delete_collection` argument. So, do the error handling only there,
and _not_ in the generic data model utility functions.
---
 .../langchain/vectorstores/cratedb/base.py    | 16 ++++++++++-
 .../langchain/vectorstores/cratedb/model.py   | 18 ++-----------
 .../vectorstores/test_cratedb.py              | 27 ++++++++++++++++++-
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/libs/langchain/langchain/vectorstores/cratedb/base.py b/libs/langchain/langchain/vectorstores/cratedb/base.py
index 922ba2ed659d6..166371325e190 100644
--- a/libs/langchain/langchain/vectorstores/cratedb/base.py
+++ b/libs/langchain/langchain/vectorstores/cratedb/base.py
@@ -164,10 +164,24 @@ def add_embeddings(
         if not embeddings:
             return []
         self._init_models(embeddings[0])
+
+        # When the user requested to delete the collection before running subsequent
+        # operations on it, run the deletion gracefully if the table does not exist
+        # yet.
         if self.pre_delete_collection:
-            self.delete_collection()
+            try:
+                self.delete_collection()
+            except sqlalchemy.exc.ProgrammingError as ex:
+                if "RelationUnknown" not in str(ex):
+                    raise
+
+        # Tables need to be created at runtime, because the `EmbeddingStore.embedding`
+        # field, a `FloatVector`, needs to be initialized with a dimensionality
+        # parameter, which is only obtained at runtime.
         self.create_tables_if_not_exists()
         self.create_collection()
+
+        # After setting up the table/collection at runtime, add embeddings.
         return super().add_embeddings(
             texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
         )
diff --git a/libs/langchain/langchain/vectorstores/cratedb/model.py b/libs/langchain/langchain/vectorstores/cratedb/model.py
index 1aec9b49a7260..c540ba2eb217f 100644
--- a/libs/langchain/langchain/vectorstores/cratedb/model.py
+++ b/libs/langchain/langchain/vectorstores/cratedb/model.py
@@ -53,27 +53,13 @@ class CollectionStore(BaseModel):
             def get_by_name(
                 cls, session: Session, name: str
             ) -> Optional["CollectionStore"]:
-                try:
-                    return (
-                        session.query(cls).filter(cls.name == name).first()  # type: ignore[attr-defined]  # noqa: E501
-                    )
-                except sqlalchemy.exc.ProgrammingError as ex:
-                    if "RelationUnknown" not in str(ex):
-                        raise
-                return None
+                return session.query(cls).filter(cls.name == name).first()  # type: ignore[attr-defined]
 
             @classmethod
             def get_by_names(
                 cls, session: Session, names: List[str]
             ) -> Optional["List[CollectionStore]"]:
-                try:
-                    return (
-                        session.query(cls).filter(cls.name.in_(names)).all()  # type: ignore[attr-defined]  # noqa: E501
-                    )
-                except sqlalchemy.exc.ProgrammingError as ex:
-                    if "RelationUnknown" not in str(ex):
-                        raise
-                return None
+                return session.query(cls).filter(cls.name.in_(names)).all()  # type: ignore[attr-defined]
 
             @classmethod
             def get_or_create(
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
index 5a732ca5332f9..1862ca895733b 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
@@ -5,6 +5,7 @@
 docker-compose -f cratedb.yml up
 """
 import os
+import re
 from typing import Dict, Generator, List, Tuple
 
 import pytest
@@ -628,12 +629,36 @@ def test_cratedb_multicollection_fail_indexing_not_permitted() -> None:
     assert ex.match("This adapter can not be used for indexing documents")
 
 
-def test_cratedb_multicollection_search_no_collections() -> None:
+def test_cratedb_multicollection_search_table_does_not_exist() -> None:
+    """
+    `CrateDBVectorSearchMultiCollection` will fail when the `collection`
+    table does not exist.
+    """
+
+    store = CrateDBVectorSearchMultiCollection(
+        collection_names=["unknown"],
+        embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+    )
+    with pytest.raises(ProgrammingError) as ex:
+        store.similarity_search("foo")
+    assert ex.match(re.escape("RelationUnknown[Relation 'collection' unknown]"))
+
+
+def test_cratedb_multicollection_search_unknown_collection() -> None:
     """
     `CrateDBVectorSearchMultiCollection` will fail when not able to identify
     collections to search in.
     """
 
+    CrateDBVectorSearch.from_texts(
+        texts=["Räuber", "Hotzenplotz"],
+        collection_name="test_collection",
+        embedding=ConsistentFakeEmbeddingsWithAdaDimension(),
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+
     store = CrateDBVectorSearchMultiCollection(
         collection_names=["unknown"],
         embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(),

From d8ecebe712a81e2220f2199ad5c925d9c860133c Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@crate.io>
Date: Tue, 21 Nov 2023 16:45:01 +0100
Subject: [PATCH 3/3] CrateDB vector: Improve testing when initialized without
 dimensionality

---
 .../vectorstores/test_cratedb.py              | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
index 1862ca895733b..86370439e6dce 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_cratedb.py
@@ -364,11 +364,12 @@ def test_cratedb_collection_with_metadata() -> None:
 
 
 def test_cratedb_collection_no_embedding_dimension() -> None:
-    """Test end to end collection construction"""
+    """
+    Verify that addressing collections fails when not specifying dimensions.
+    """
     cratedb_vector = CrateDBVectorSearch(
         embedding_function=None,  # type: ignore[arg-type]
         connection_string=CONNECTION_STRING,
-        pre_delete_collection=True,
     )
     session = Session(cratedb_vector.connect())
     with pytest.raises(RuntimeError) as ex:
@@ -667,3 +668,20 @@ def test_cratedb_multicollection_search_unknown_collection() -> None:
     with pytest.raises(ValueError) as ex:
         store.similarity_search("foo")
     assert ex.match("No collections found")
+
+
+def test_cratedb_multicollection_no_embedding_dimension() -> None:
+    """
+    Verify that `get_collections` fails when the embedding dimension is unknown.
+    """
+    store = CrateDBVectorSearchMultiCollection(
+        embedding_function=None,  # type: ignore[arg-type]
+        connection_string=CONNECTION_STRING,
+    )
+    session = Session(store.connect())
+    with pytest.raises(RuntimeError) as ex:
+        store.get_collections(session)
+    assert ex.match(
+        "Collection can't be accessed without specifying "
+        "dimension size of embedding vectors"
+    )