From afbeacfbbbd88394276cfa0cef233600c566e691 Mon Sep 17 00:00:00 2001 From: Filip Haltmayer <81822489+filip-halt@users.noreply.github.com> Date: Mon, 15 May 2023 16:14:48 -0700 Subject: [PATCH] Milvus load and guide fix (#3378) Signed-off-by: Filip Haltmayer --- .../vector_stores/MilvusIndexDemo.ipynb | 188 ++++-------------- llama_index/vector_stores/milvus.py | 3 + 2 files changed, 46 insertions(+), 145 deletions(-) diff --git a/docs/examples/vector_stores/MilvusIndexDemo.ipynb b/docs/examples/vector_stores/MilvusIndexDemo.ipynb index 3f57d0933f145..861a615d1e38b 100644 --- a/docs/examples/vector_stores/MilvusIndexDemo.ipynb +++ b/docs/examples/vector_stores/MilvusIndexDemo.ipynb @@ -109,13 +109,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Document ID: 933666c4-2833-475a-a3d5-d279a0c174fa Document Hash: 77ae91ab542f3abb308c4d7c77c9bc4c9ad0ccd63144802b7cbe7e1bb3a4094e\n" + "Document ID: 05b6691b-d567-43a2-94e1-e9ca81cd4624 Document Hash: 77ae91ab542f3abb308c4d7c77c9bc4c9ad0ccd63144802b7cbe7e1bb3a4094e\n" ] } ], "source": [ "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", + "documents = SimpleDirectoryReader('../data/paul_graham/').load_data()\n", "print('Document ID:', documents[0].doc_id, 'Document Hash:', documents[0].doc_hash)" ] }, @@ -153,16 +153,7 @@ "is_executing": true } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17617 tokens\n" - ] - } - ], + "outputs": [], "source": [ "# Create an index over the documnts\n", "from llama_index.storage.storage_context import StorageContext\n", @@ -193,53 +184,27 @@ "start_time": "2023-02-10T12:20:33.822688Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4028 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens\n" - ] - } - ], - "source": [ - "query_engine = index.as_query_engine()\n", - "response = query_engine.query(\"What did the author learn?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "bedbb693-725f-478f-be26-fa7180ea38b2", - "metadata": { - "ExecuteTime": { - "end_time": "2023-02-10T12:20:51.337062Z", - "start_time": "2023-02-10T12:20:51.330857Z" - } - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " The author learned that working on things that are not prestigious can be a good thing, as it can\n", - "lead to discovering something real and avoiding the wrong track. The author also learned that\n", - "ignorance can be beneficial, as it can lead to discovering something new and unexpected. The author\n", - "also learned the importance of working hard, even at the parts of the job they don't like, in order\n", - "to set an example for others. The author also learned the value of unsolicited advice, as it can be\n", - "beneficial in unexpected ways, such as when Robert Morris suggested that the author should make sure\n", - "Y Combinator wasn't the last cool thing they did.\n" + " The author learned that the AI programs of the time were not capable of understanding natural\n", + "language, and that the field of AI was a hoax. He also learned that he could make art, and that he\n", + "could pass the entrance exam for the Accademia di Belli Arti in Florence. He also learned Lisp\n", + "hacking and wrote his dissertation on applications of continuations.\n" ] } ], "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"What did the author learn?\")\n", "print(textwrap.fill(str(response), 100))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "99212d33", "metadata": { "ExecuteTime": { @@ -247,47 +212,19 @@ "start_time": "2023-02-10T12:20:51.338718Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4072 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 9 tokens\n" - ] - } - ], - "source": [ - "response = query_engine.query(\"What was a hard moment for the author?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1a720ad6", - "metadata": { - "ExecuteTime": { - "end_time": "2023-02-10T12:21:10.355872Z", - "start_time": "2023-02-10T12:21:10.343486Z" - } - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " A hard moment for the author was when he was dealing with urgent problems during YC and about 60%\n", - "of them had to do with Hacker News, a news aggregator he had created. He was overwhelmed by the\n", - "amount of work he had to do to keep Hacker News running, and it was taking away from his ability to\n", - "focus on other projects. He was also haunted by the idea that his own work ethic set the upper bound\n", - "for how hard everyone else worked, so he felt he had to work very hard. He was also dealing with\n", - "disputes between cofounders, figuring out when people were lying to them, and fighting with people\n", - "who maltreated the startups. On top of this, he was given unsolicited advice from Robert Morris to\n", - "make sure Y Combinator wasn't the last cool thing he did, which made him consider quitting.\n" + " A hard moment for the author was when he realized that the AI programs of the time were a hoax and\n", + "that there was an unbridgeable gap between what they could do and actually understanding natural\n", + "language.\n" ] } ], "source": [ + "response = query_engine.query(\"What was a hard moment for the author?\")\n", "print(textwrap.fill(str(response), 100))" ] }, @@ -302,25 +239,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "id": "8d641e24", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 5 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 44 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", "Res: \n", "The author is unknown.\n" ] @@ -329,12 +255,10 @@ "source": [ "vector_store = MilvusVectorStore(overwrite=True)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", - "index = GPTVectorStoreIndex.from_documents([Document(\"The answer is ten.\")], storage_context)\n", + "index = GPTVectorStoreIndex.from_documents([Document(\"The number that is being searched for is ten.\")], storage_context)\n", "query_engine = index.as_query_engine()\n", "res = query_engine.query(\"Who is the author?\")\n", - "\n", - "print(flush=True)\n", - "print(\"Res:\", res, flush=True)" + "print(\"Res:\", res)" ] }, { @@ -348,74 +272,48 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "a5c429a4", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 44 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "The answer is ten.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17617 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 41 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Ten.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 3720 tokens\n", - "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "The author of the text is Paul Graham, co-founder of Y Combinator.\n" + "Res: \n", + "The number is ten.\n" ] } ], "source": [ - "del index\n", - "\n", - "query_engine = index.as_query_engine()\n", - "print(query_engine.query(\"What is the answer.\"))\n", + "del index, vector_store, storage_context, query_engine\n", "\n", "vector_store = MilvusVectorStore(overwrite=False)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", "index = GPTVectorStoreIndex.from_documents(documents, storage_context=storage_context)\n", "query_engine = index.as_query_engine()\n", - "print(query_engine.query(\"What is the answer?\"))\n", - "print(query_engine.query(\"Who is the author?\"))\n" + "res = query_engine.query(\"What is the number?\")\n", + "print(\"Res:\", res)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e5287c2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Res: \n", + "The author is Paul Graham.\n" + ] + } + ], + "source": [ + "res = query_engine.query(\"Who is the author?\")\n", + "print(\"Res:\", res)" ] } ], diff --git a/llama_index/vector_stores/milvus.py b/llama_index/vector_stores/milvus.py index 256d6c953931c..77c85723b4e0d 100644 --- a/llama_index/vector_stores/milvus.py +++ b/llama_index/vector_stores/milvus.py @@ -310,6 +310,9 @@ def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: assert self.collection is not None self.collection.load() + elif len(embedding_results) == 0: + return [] + ids = [] doc_ids = [] texts = []