From f5ac4d153ed6b2abaae2ccec7c55917e3813d07b Mon Sep 17 00:00:00 2001
From: thomasht86
Date: Fri, 15 Nov 2024 15:14:48 +0100
Subject: [PATCH] update to match

---
 ...ual_pdf_rag_with_vespa_colpali_cloud.ipynb | 97 +++++++++++--------
 1 file changed, 54 insertions(+), 43 deletions(-)

diff --git a/docs/sphinx/source/examples/visual_pdf_rag_with_vespa_colpali_cloud.ipynb b/docs/sphinx/source/examples/visual_pdf_rag_with_vespa_colpali_cloud.ipynb
index 34c450d7..6205d6e1 100644
--- a/docs/sphinx/source/examples/visual_pdf_rag_with_vespa_colpali_cloud.ipynb
+++ b/docs/sphinx/source/examples/visual_pdf_rag_with_vespa_colpali_cloud.ipynb
@@ -1265,14 +1265,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "id": "5b7acbb6",
    "metadata": {
     "id": "5b7acbb6"
    },
    "outputs": [],
    "source": [
-    "# Define the Vespa schema\n",
     "colpali_schema = Schema(\n",
     "    name=VESPA_SCHEMA_NAME,\n",
     "    document=Document(\n",
@@ -1293,9 +1292,7 @@
     "                index=\"enable-bm25\",\n",
     "            ),\n",
     "            Field(name=\"page_number\", type=\"int\", indexing=[\"summary\", \"attribute\"]),\n",
-    "            Field(\n",
-    "                name=\"blur_image\", type=\"raw\", indexing=[\"summary\"]\n",
-    "            ),  # We store the images as base64-encoded strings\n",
+    "            Field(name=\"blur_image\", type=\"raw\", indexing=[\"summary\"]),\n",
     "            Field(name=\"full_image\", type=\"raw\", indexing=[\"summary\"]),\n",
     "            Field(\n",
     "                name=\"text\",\n",
@@ -1305,31 +1302,29 @@
     "                index=\"enable-bm25\",\n",
     "            ),\n",
     "            Field(\n",
-    "                name=\"embedding\",  # The page image embeddings are stored as binary tensors (represented by signed 8-bit integers)\n",
+    "                name=\"embedding\",\n",
     "                type=\"tensor(patch{}, v[16])\",\n",
     "                indexing=[\n",
     "                    \"attribute\",\n",
     "                    \"index\",\n",
     "                ],\n",
-    "                ann=HNSW(  # Since we are using binary embeddings, we use the HNSW algorithm with Hamming distance as the metric, which is highly efficient in Vespa, see https://blog.vespa.ai/scaling-colpali-to-billions/\n",
+    "                ann=HNSW(\n",
     "                    distance_metric=\"hamming\",\n",
     "                    max_links_per_node=32,\n",
     "                    neighbors_to_explore_at_insert=400,\n",
     "                ),\n",
     "            ),\n",
     "            Field(\n",
-    "                name=\"questions\",  # We store the generated questions and queries as arrays of strings for each document\n",
+    "                name=\"questions\",\n",
     "                type=\"array<string>\",\n",
-    "                indexing=[\"summary\", \"index\", \"attribute\"],\n",
-    "                index=\"enable-bm25\",\n",
-    "                stemming=\"best\",\n",
+    "                indexing=[\"summary\", \"attribute\"],\n",
+    "                summary=Summary(fields=[\"matched-elements-only\"]),\n",
     "            ),\n",
     "            Field(\n",
     "                name=\"queries\",\n",
     "                type=\"array<string>\",\n",
-    "                indexing=[\"summary\", \"index\", \"attribute\"],\n",
-    "                index=\"enable-bm25\",\n",
-    "                stemming=\"best\",\n",
+    "                indexing=[\"summary\", \"attribute\"],\n",
+    "                summary=Summary(fields=[\"matched-elements-only\"]),\n",
     "            ),\n",
     "        ]\n",
     "    ),\n",
@@ -1389,7 +1384,7 @@
     "]\n",
     "\n",
     "# Define the 'bm25' rank profile\n",
-    "colpali_bm25_profile = RankProfile(\n",
+    "bm25 = RankProfile(\n",
     "    name=\"bm25\",\n",
     "    inputs=[(\"query(qt)\", \"tensor(querytoken{}, v[128])\")],\n",
     "    first_phase=\"bm25(title) + bm25(text)\",\n",
@@ -1407,14 +1402,27 @@
     "    )\n",
     "\n",
     "\n",
-    "colpali_schema.add_rank_profile(colpali_bm25_profile)\n",
-    "colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))\n",
+    "colpali_schema.add_rank_profile(bm25)\n",
+    "colpali_schema.add_rank_profile(with_quantized_similarity(bm25))\n",
     "\n",
-    "# Update the 'default' rank profile\n",
-    "colpali_profile = RankProfile(\n",
-    "    name=\"default\",\n",
-    "    inputs=[(\"query(qt)\", \"tensor(querytoken{}, v[128])\")],\n",
-    "    first_phase=\"bm25_score\",\n",
+    "\n",
+    "# Update the 'colpali' rank profile\n",
+    "input_query_tensors = []\n",
+    "MAX_QUERY_TERMS = 64\n",
+    "for i in range(MAX_QUERY_TERMS):\n",
+    "    input_query_tensors.append((f\"query(rq{i})\", \"tensor(v[16])\"))\n",
+    "\n",
+    "input_query_tensors.extend(\n",
+    "    [\n",
+    "        (\"query(qt)\", \"tensor(querytoken{}, v[128])\"),\n",
+    "        (\"query(qtb)\", \"tensor(querytoken{}, v[16])\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "colpali = RankProfile(\n",
+    "    name=\"colpali\",\n",
+    "    inputs=input_query_tensors,\n",
+    "    first_phase=\"max_sim_binary\",\n",
     "    second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=10),\n",
     "    functions=mapfunctions\n",
     "    + [\n",
@@ -1432,30 +1440,33 @@
     "                )\n",
     "            \"\"\",\n",
     "        ),\n",
-    "        Function(name=\"bm25_score\", expression=\"bm25(title) + bm25(text)\"),\n",
+    "        Function(\n",
+    "            name=\"max_sim_binary\",\n",
+    "            expression=\"\"\"\n",
+    "                sum(\n",
+    "                    reduce(\n",
+    "                        1 / (1 + sum(\n",
+    "                            hamming(query(qtb), attribute(embedding)), v)\n",
+    "                        ),\n",
+    "                        max, patch\n",
+    "                    ),\n",
+    "                    querytoken\n",
+    "                )\n",
+    "            \"\"\",\n",
+    "        ),\n",
     "    ],\n",
     ")\n",
-    "colpali_schema.add_rank_profile(colpali_profile)\n",
-    "colpali_schema.add_rank_profile(with_quantized_similarity(colpali_profile))\n",
-    "\n",
-    "# Update the 'retrieval-and-rerank' rank profile\n",
-    "input_query_tensors = []\n",
-    "MAX_QUERY_TERMS = 64\n",
-    "for i in range(MAX_QUERY_TERMS):\n",
-    "    input_query_tensors.append((f\"query(rq{i})\", \"tensor(v[16])\"))\n",
-    "\n",
-    "input_query_tensors.extend(\n",
-    "    [\n",
-    "        (\"query(qt)\", \"tensor(querytoken{}, v[128])\"),\n",
-    "        (\"query(qtb)\", \"tensor(querytoken{}, v[16])\"),\n",
-    "    ]\n",
-    ")\n",
+    "colpali_schema.add_rank_profile(colpali)\n",
+    "colpali_schema.add_rank_profile(with_quantized_similarity(colpali))\n",
     "\n",
-    "colpali_retrieval_profile = RankProfile(\n",
-    "    name=\"retrieval-and-rerank\",\n",
+    "# Update the 'hybrid' rank profile\n",
+    "hybrid = RankProfile(\n",
+    "    name=\"hybrid\",\n",
     "    inputs=input_query_tensors,\n",
     "    first_phase=\"max_sim_binary\",\n",
-    "    second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=10),\n",
+    "    second_phase=SecondPhaseRanking(\n",
+    "        expression=\"max_sim + 2 * (bm25(text) + bm25(title))\", rerank_count=10\n",
+    "    ),\n",
     "    functions=mapfunctions\n",
     "    + [\n",
     "        Function(\n",
@@ -1488,8 +1499,8 @@
     "        ),\n",
     "    ],\n",
     ")\n",
-    "colpali_schema.add_rank_profile(colpali_retrieval_profile)\n",
-    "colpali_schema.add_rank_profile(with_quantized_similarity(colpali_retrieval_profile))"
+    "colpali_schema.add_rank_profile(hybrid)\n",
+    "colpali_schema.add_rank_profile(with_quantized_similarity(hybrid))"
    ]
   },
   {