From 3814b2a40bcaa917512890f7c8f35b05a7a1dad5 Mon Sep 17 00:00:00 2001 From: Jo Kristian Bergum Date: Tue, 1 Oct 2024 20:38:17 +0200 Subject: [PATCH 1/3] Use pip3 --- .../examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb index 6282bbd5..48ab3cbc 100644 --- a/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb @@ -83,7 +83,7 @@ }, "outputs": [], "source": [ - "%pip install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm" + "! pip3 install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm" ] }, { From 5c6d9c64525d66783878104667119cc0f596ff50 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Wed, 2 Oct 2024 06:57:08 +0200 Subject: [PATCH 2/3] remove whitespace --- ...rieval-with-ColQwen2-vlm_Vespa-cloud.ipynb | 397 ++++++++++-------- 1 file changed, 214 insertions(+), 183 deletions(-) diff --git a/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb index 48ab3cbc..f1662784 100644 --- a/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb @@ -12,9 +12,9 @@ " \"#Vespa\"\n", "\n", "\n", - "# PDF-Retrieval using ColQWen2 (ColPali) with Vespa \n", + "# PDF-Retrieval using ColQWen2 (ColPali) with Vespa\n", "\n", - "This notebook is a continuation of our notebooks related to the ColPali models for complex document retrieval. \n", + "This notebook is a continuation of our notebooks related to the ColPali models for complex document retrieval.\n", "\n", "This notebook demonstrates using the new [ColQWen2](https://huggingface.co/vidore/colqwen2-v0.1) model checkpoint.\n", "\n", @@ -22,16 +22,16 @@ "\n", "ColQWen2 is better than the previous ColPali model in the following ways:\n", "\n", - "- Its more accurate on the ViDoRe dataset (+5 nDCCG@5 points) \n", - "- It's permissive licensed as both the base model and adapter is using open-source licences (Apache 2.0 and MIT) \n", - "- It uses fewer patch embeddings than ColPaliGemma (from 1024 to 768), this reduces both compute and storage. \n", + "- Its more accurate on the ViDoRe dataset (+5 nDCCG@5 points)\n", + "- It's permissive licensed as both the base model and adapter is using open-source licences (Apache 2.0 and MIT)\n", + "- It uses fewer patch embeddings than ColPaliGemma (from 1024 to 768), this reduces both compute and storage.\n", "\n", - " See also [Scaling ColPali to billions of PDFs with Vespa](https://blog.vespa.ai/scaling-colpali-to-billions/)\n", + "See also [Scaling ColPali to billions of PDFs with Vespa](https://blog.vespa.ai/scaling-colpali-to-billions/)\n", "\n", "The TLDR; of this notebook:\n", "\n", - "- Generate an image per PDF page using [pdf2image](https://pypi.org/project/pdf2image/) \n", - "and also extract the text using [pypdf](https://pypdf.readthedocs.io/en/stable/user/extract-text.html). 
\n", + "- Generate an image per PDF page using [pdf2image](https://pypi.org/project/pdf2image/)\n", + " and also extract the text using [pypdf](https://pypdf.readthedocs.io/en/stable/user/extract-text.html).\n", "- For each page image, use ColPali to obtain the visual multi-vector embeddings\n", "\n", "Then we store visual embeddings in Vespa as a `int8` tensor, where we use a binary compression technique\n", @@ -47,16 +47,15 @@ "\n", "This allows us to scale ColPali to very large collections of PDF pages, while still providing accurate and fast retrieval.\n", "\n", - "Let us get started. \n", + "Let us get started.\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb)\n", "\n", + "Install dependencies:\n", "\n", - "Install dependencies: \n", + "Note that the python pdf2image package requires poppler-utils, see other installation options [here](https://pdf2image.readthedocs.io/en/latest/installation.html#installing-poppler).\n", "\n", - "Note that the python pdf2image package requires poppler-utils, see other installation options [here](https://pdf2image.readthedocs.io/en/latest/installation.html#installing-poppler). \n", - "\n", - "For MacOs, the simplest install option is `brew install poppler` if you are using [Homebrew](https://brew.sh/)." + "For MacOs, the simplest install option is `brew install poppler` if you are using [Homebrew](https://brew.sh/).\n" ] }, { @@ -72,7 +71,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now install the required python packages:" + "Now install the required python packages:\n" ] }, { @@ -83,7 +82,7 @@ }, "outputs": [], "source": [ - "! pip3 install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm" + "!pip3 install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm" ] }, { @@ -109,7 +108,7 @@ "source": [ "### Load the model\n", "\n", - "We use device map auto to load the model on the available GPU if available, otherwise on the CPU or MPS if available." + "We use device map auto to load the model on the available GPU if available, otherwise on the CPU or MPS if available.\n" ] }, { @@ -304,12 +303,10 @@ "model_name = \"vidore/colqwen2-v0.1\"\n", "\n", "model = ColQwen2.from_pretrained(\n", - " model_name,\n", - " torch_dtype=torch.bfloat16,\n", - " device_map=\"auto\")\n", + " model_name, torch_dtype=torch.bfloat16, device_map=\"auto\"\n", + ")\n", "processor = ColQwen2Processor.from_pretrained(model_name)\n", - "model = model.eval()\n", - "\n" + "model = model.eval()" ] }, { @@ -320,10 +317,10 @@ "source": [ "### Working with pdfs\n", "\n", - "We need to convert a PDF to an array of images. One image per page. \n", - "We use the `pdf2image` library for this task. Secondary, we also extract the text contents of the PDF using `pypdf`. \n", + "We need to convert a PDF to an array of images. One image per page.\n", + "We use the `pdf2image` library for this task. Secondary, we also extract the text contents of the PDF using `pypdf`.\n", "\n", - "NOTE: This step requires that you have `poppler` installed on your system. Read more in [pdf2image](https://pdf2image.readthedocs.io/en/latest/installation.html) docs." + "NOTE: This step requires that you have `poppler` installed on your system. 
Read more in [pdf2image](https://pdf2image.readthedocs.io/en/latest/installation.html) docs.\n" ] }, { @@ -338,6 +335,7 @@ "from pdf2image import convert_from_path\n", "from pypdf import PdfReader\n", "\n", + "\n", "def download_pdf(url):\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", @@ -345,6 +343,7 @@ " else:\n", " raise Exception(f\"Failed to download PDF: Status code {response.status_code}\")\n", "\n", + "\n", "def get_pdf_images(pdf_url):\n", " # Download the PDF\n", " pdf_file = download_pdf(pdf_url)\n", @@ -367,7 +366,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We define a few sample PDFs to work with. The PDFs are discovered from [this url](https://www.conocophillips.com/company-reports-resources/sustainability-reporting/)." + "We define a few sample PDFs to work with. The PDFs are discovered from [this url](https://www.conocophillips.com/company-reports-resources/sustainability-reporting/).\n" ] }, { @@ -379,18 +378,18 @@ "outputs": [], "source": [ "sample_pdfs = [\n", - " {\n", - " \"title\": \"ConocoPhillips Sustainability Highlights - Nature (24-0976)\",\n", - " \"url\": \"https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf\"\n", - " },\n", - " {\n", - " \"title\": \"ConocoPhillips Managing Climate Related Risks\",\n", - " \"url\": \"https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf\"\n", - " },\n", - " {\n", - " \"title\": \"ConocoPhillips 2023 Sustainability Report\",\n", - " \"url\": \"https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf\" \n", - " }\n", + " {\n", + " \"title\": \"ConocoPhillips Sustainability Highlights - Nature (24-0976)\",\n", + " \"url\": \"https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf\",\n", + " },\n", + " {\n", + " \"title\": \"ConocoPhillips Managing Climate Related Risks\",\n", + " \"url\": \"https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf\",\n", + " },\n", + " {\n", + " \"title\": \"ConocoPhillips 2023 Sustainability Report\",\n", + " \"url\": \"https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf\",\n", + " },\n", "]" ] }, @@ -398,7 +397,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can convert the PDFs to images and also extract the text content." + "Now we can convert the PDFs to images and also extract the text content.\n" ] }, { @@ -410,9 +409,9 @@ "outputs": [], "source": [ "for pdf in sample_pdfs:\n", - " page_images, page_texts = get_pdf_images(pdf['url'])\n", - " pdf['images'] = page_images\n", - " pdf['texts'] = page_texts\n" + " page_images, page_texts = get_pdf_images(pdf[\"url\"])\n", + " pdf[\"images\"] = page_images\n", + " pdf[\"texts\"] = page_texts" ] }, { @@ -421,7 +420,7 @@ "id": "b3vBUFwATIqk" }, "source": [ - "Let us look at the extracted image of the first PDF page. This is the document side input to ColPali, one image per page." + "Let us look at the extracted image of the first PDF page. 
This is the document side input to ColPali, one image per page.\n" ] }, { @@ -451,6 +450,7 @@ "source": [ "from IPython.display import display\n", "\n", + "\n", "def resize_image(image, max_height=800):\n", " width, height = image.size\n", " if height > max_height:\n", @@ -460,14 +460,15 @@ " return image.resize((new_width, new_height))\n", " return image\n", "\n", - "display(resize_image(sample_pdfs[0]['images'][0]))" + "\n", + "display(resize_image(sample_pdfs[0][\"images\"][0]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let us also look at the extracted text content of the first PDF page. " + "Let us also look at the extracted text content of the first PDF page.\n" ] }, { @@ -517,25 +518,25 @@ } ], "source": [ - "print(sample_pdfs[0]['texts'][0])" + "print(sample_pdfs[0][\"texts\"][0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notice how the layout and order of the text is different from the image representation. Note that \n", + "Notice how the layout and order of the text is different from the image representation. Note that\n", "\n", "- The headlines NATURE and Sustainability have been combined into one word (NATURESustainability).\n", "- The 0.03% has been converted to 0.03 and order is not preserved in the text representation.\n", - "- The data in the infographics is not represented in the text representation. " + "- The data in the infographics is not represented in the text representation.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we use the ColPali model to generate embeddings of the images." + "Now we use the ColPali model to generate embeddings of the images.\n" ] }, { @@ -560,44 +561,40 @@ } ], "source": [ - "\n", "for pdf in sample_pdfs:\n", - " page_embeddings = []\n", - " dataloader = DataLoader(\n", - " pdf['images'],\n", + " page_embeddings = []\n", + " dataloader = DataLoader(\n", + " pdf[\"images\"],\n", " batch_size=2,\n", " shuffle=False,\n", " collate_fn=lambda x: processor.process_images(x),\n", - " )\n", - "\n", - " for batch_doc in tqdm(dataloader):\n", - " with torch.no_grad():\n", - " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", - " embeddings_doc = model(**batch_doc)\n", - " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", - " pdf['embeddings'] = page_embeddings\n", - " \n", + " )\n", "\n", - " " + " for batch_doc in tqdm(dataloader):\n", + " with torch.no_grad():\n", + " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", + " embeddings_doc = model(**batch_doc)\n", + " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", + " pdf[\"embeddings\"] = page_embeddings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we are done with the document side embeddings, we convert the embeddings to Vespa JSON format so we can store (and index) them in Vespa. \n", + "Now we are done with the document side embeddings, we convert the embeddings to Vespa JSON format so we can store (and index) them in Vespa.\n", "Details in [Vespa JSON feed format doc](https://docs.vespa.ai/en/reference/document-json-format.html).\n", "\n", "We use binary quantization (BQ) of the page level ColPali vector embeddings to reduce their size by 32x.\n", "\n", - "Read more about binarization of multi-vector representations in the [colbert blog post](https://blog.vespa.ai/announcing-colbert-embedder-in-vespa/). 
\n", + "Read more about binarization of multi-vector representations in the [colbert blog post](https://blog.vespa.ai/announcing-colbert-embedder-in-vespa/).\n", "\n", - "The binarization step maps 128 dimensional floats to 128 bits, or 16 bytes per vector. \n", + "The binarization step maps 128 dimensional floats to 128 bits, or 16 bytes per vector.\n", "\n", - "Reducing the size by 32x. On the [DocVQA benchmark](https://huggingface.co/datasets/vidore/docvqa_test_subsampled), binarization results in a small drop in ranking accuracy. \n", + "Reducing the size by 32x. On the [DocVQA benchmark](https://huggingface.co/datasets/vidore/docvqa_test_subsampled), binarization results in a small drop in ranking accuracy.\n", "\n", "We also demonstrate how to store the image data in Vespa using the [raw](https://docs.vespa.ai/en/reference/schema-reference.html#raw) type for binary data. To encode\n", - "the binary data in JSON, we use base64 encoding. " + "the binary data in JSON, we use base64 encoding.\n" ] }, { @@ -607,6 +604,8 @@ "outputs": [], "source": [ "import base64\n", + "\n", + "\n", "def get_base64_image(image):\n", " buffered = BytesIO()\n", " image.save(buffered, format=\"JPEG\")\n", @@ -622,26 +621,34 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "vespa_feed = []\n", "for pdf in sample_pdfs:\n", - " url = pdf['url']\n", - " title = pdf['title']\n", - " for page_number, (page_text, embedding, image) in enumerate(zip(pdf['texts'], pdf['embeddings'], pdf['images'])):\n", - " base_64_image = get_base64_image(resize_image(image,640))\n", - " embedding_dict = dict()\n", - " for idx, patch_embedding in enumerate(embedding):\n", - " binary_vector = np.packbits(np.where(patch_embedding > 0, 1, 0)).astype(np.int8).tobytes().hex()\n", - " embedding_dict[idx] = binary_vector \n", - " page = {\n", - " \"id\": hash(url + str(page_number)),\n", - " \"url\": url,\n", - " \"title\": title,\n", - " \"page_number\": page_number,\n", - " \"image\": base_64_image,\n", - " \"text\": page_text,\n", - " \"embedding\": embedding_dict\n", - " }\n", - " vespa_feed.append(page)" + " url = pdf[\"url\"]\n", + " title = pdf[\"title\"]\n", + " for page_number, (page_text, embedding, image) in enumerate(\n", + " zip(pdf[\"texts\"], pdf[\"embeddings\"], pdf[\"images\"])\n", + " ):\n", + " base_64_image = get_base64_image(resize_image(image, 640))\n", + " embedding_dict = dict()\n", + " for idx, patch_embedding in enumerate(embedding):\n", + " binary_vector = (\n", + " np.packbits(np.where(patch_embedding > 0, 1, 0))\n", + " .astype(np.int8)\n", + " .tobytes()\n", + " .hex()\n", + " )\n", + " embedding_dict[idx] = binary_vector\n", + " page = {\n", + " \"id\": hash(url + str(page_number)),\n", + " \"url\": url,\n", + " \"title\": title,\n", + " \"page_number\": page_number,\n", + " \"image\": base_64_image,\n", + " \"text\": page_text,\n", + " \"embedding\": embedding_dict,\n", + " }\n", + " vespa_feed.append(page)" ] }, { @@ -649,10 +656,11 @@ "metadata": {}, "source": [ "### Configure Vespa\n", + "\n", "[PyVespa](https://pyvespa.readthedocs.io/en/latest/) helps us build the [Vespa application package](https://docs.vespa.ai/en/application-packages.html).\n", "A Vespa application package consists of configuration files, schemas, models, and code (plugins).\n", "\n", - "First, we define a [Vespa schema](https://docs.vespa.ai/en/schemas.html) with the fields we want to store and their type." 
+ "First, we define a [Vespa schema](https://docs.vespa.ai/en/schemas.html) with the fields we want to store and their type.\n" ] }, { @@ -661,50 +669,70 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "from vespa.package import Schema, Document, Field, FieldSet, HNSW\n", "\n", "colpali_schema = Schema(\n", " name=\"pdf_page\",\n", " document=Document(\n", " fields=[\n", - " Field(name=\"id\", type=\"string\", indexing=[\"summary\", \"index\"], match=[\"word\"]),\n", + " Field(\n", + " name=\"id\", type=\"string\", indexing=[\"summary\", \"index\"], match=[\"word\"]\n", + " ),\n", " Field(name=\"url\", type=\"string\", indexing=[\"summary\", \"index\"]),\n", - " Field(name=\"title\", type=\"string\", indexing=[\"summary\", \"index\"], match=[\"text\"], index=\"enable-bm25\"),\n", + " Field(\n", + " name=\"title\",\n", + " type=\"string\",\n", + " indexing=[\"summary\", \"index\"],\n", + " match=[\"text\"],\n", + " index=\"enable-bm25\",\n", + " ),\n", " Field(name=\"page_number\", type=\"int\", indexing=[\"summary\", \"attribute\"]),\n", " Field(name=\"image\", type=\"raw\", indexing=[\"summary\"]),\n", - " Field(name=\"text\", type=\"string\", indexing=[\"index\"], match=[\"text\"], index=\"enable-bm25\"),\n", + " Field(\n", + " name=\"text\",\n", + " type=\"string\",\n", + " indexing=[\"index\"],\n", + " match=[\"text\"],\n", + " index=\"enable-bm25\",\n", + " ),\n", " Field(\n", " name=\"embedding\",\n", " type=\"tensor(patch{}, v[16])\",\n", - " indexing=[\"attribute\", \"index\"], # adds HNSW index for candidate retrieval.\n", - " ann=HNSW(distance_metric=\"hamming\", max_links_per_node=32, neighbors_to_explore_at_insert=400), \n", - " )\n", + " indexing=[\n", + " \"attribute\",\n", + " \"index\",\n", + " ], # adds HNSW index for candidate retrieval.\n", + " ann=HNSW(\n", + " distance_metric=\"hamming\",\n", + " max_links_per_node=32,\n", + " neighbors_to_explore_at_insert=400,\n", + " ),\n", + " ),\n", " ]\n", " ),\n", - " fieldsets=[FieldSet(name=\"default\", fields=[\"title\", \"text\"])]\n", - ")\n" + " fieldsets=[FieldSet(name=\"default\", fields=[\"title\", \"text\"])],\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the `embedding` field which is a tensor field with the type `tensor(patch{}, v[16])`. \n", - "This is the field we use to represent the page level patch embeddings from ColPali. \n", + "Notice the `embedding` field which is a tensor field with the type `tensor(patch{}, v[16])`.\n", + "This is the field we use to represent the page level patch embeddings from ColPali.\n", "\n", - "We also enable [HNSW indexing](https://docs.vespa.ai/en/approximate-nn-hnsw.html) \n", - "for this field to enable fast nearest neighbor search which is used for candidate retrieval. \n", + "We also enable [HNSW indexing](https://docs.vespa.ai/en/approximate-nn-hnsw.html)\n", + "for this field to enable fast nearest neighbor search which is used for candidate retrieval.\n", "\n", - "We use [binary hamming distance](https://docs.vespa.ai/en/nearest-neighbor-search.html#using-binary-embeddings-with-hamming-distance) \n", + "We use [binary hamming distance](https://docs.vespa.ai/en/nearest-neighbor-search.html#using-binary-embeddings-with-hamming-distance)\n", "as an approximation of the cosine similarity. Hamming distance is a good approximation\n", - "for binary representations, and it is much faster to compute than cosine similarity/dot product. 
\n", + "for binary representations, and it is much faster to compute than cosine similarity/dot product.\n", "\n", - "The `embedding` field is an example of a mixed tensor where we combine one mapped (sparse) dimensions with a dense dimension. \n", + "The `embedding` field is an example of a mixed tensor where we combine one mapped (sparse) dimensions with a dense dimension.\n", "\n", - "Read more in [Tensor guide](https://docs.vespa.ai/en/tensor-user-guide.html). We also enable [BM25](https://docs.vespa.ai/en/reference/bm25.html) for the `title` and `texts` fields. Notice that the `image` field use type `raw` to store the binary image data, encoded with as a base64 string. \n", + "Read more in [Tensor guide](https://docs.vespa.ai/en/tensor-user-guide.html). We also enable [BM25](https://docs.vespa.ai/en/reference/bm25.html) for the `title` and `texts` fields. Notice that the `image` field use type `raw` to store the binary image data, encoded with as a base64 string.\n", "\n", - "Create the Vespa [application package](https://docs.vespa.ai/en/application-packages): " + "Create the Vespa [application package](https://docs.vespa.ai/en/application-packages):\n" ] }, { @@ -725,11 +753,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we define how we want to rank the pages for a query. We use Vespa's support for [BM25](https://docs.vespa.ai/en/reference/bm25.html) for the text, and \n", - "late interaction with Max Sim for the image embeddings. \n", + "Now we define how we want to rank the pages for a query. We use Vespa's support for [BM25](https://docs.vespa.ai/en/reference/bm25.html) for the text, and\n", + "late interaction with Max Sim for the image embeddings.\n", "\n", - "This means that we use the the text representations as a candidate retrieval phase, then we use the ColPALI embeddings with MaxSim \n", - "to rerank the pages. " + "This means that we use the the text representations as a candidate retrieval phase, then we use the ColPALI embeddings with MaxSim\n", + "to rerank the pages.\n" ] }, { @@ -758,12 +786,10 @@ " )\n", " \"\"\",\n", " ),\n", - " Function(\n", - " name=\"bm25_score\", expression=\"bm25(title) + bm25(text)\"\n", - " )\n", + " Function(name=\"bm25_score\", expression=\"bm25(title) + bm25(text)\"),\n", " ],\n", " first_phase=FirstPhaseRanking(expression=\"bm25_score\"),\n", - " second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=100)\n", + " second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=100),\n", ")\n", "colpali_schema.add_rank_profile(colpali_profile)" ] @@ -772,7 +798,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The first phase uses a linear combination of BM25 scores for the text fields, and the second phase uses the MaxSim function with the image embeddings. Notice that Vespa supports a `unpack_bits` function to convert the 16 compressed binary vectors to 128-dimensional floats for the MaxSim function. The query input tensor is not compressed and using full float resolution." + "The first phase uses a linear combination of BM25 scores for the text fields, and the second phase uses the MaxSim function with the image embeddings. Notice that Vespa supports a `unpack_bits` function to convert the 16 compressed binary vectors to 128-dimensional floats for the MaxSim function. 
The query input tensor is not compressed and using full float resolution.\n" ] }, { @@ -781,7 +807,7 @@ "source": [ "### Deploy the application to Vespa Cloud\n", "\n", - "With the configured application, we can deploy it to [Vespa Cloud](https://cloud.vespa.ai/en/)." + "With the configured application, we can deploy it to [Vespa Cloud](https://cloud.vespa.ai/en/).\n" ] }, { @@ -803,10 +829,10 @@ "from vespa.deployment import VespaCloud\n", "import os\n", "\n", - "os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "# Replace with your tenant name from the Vespa Cloud Console\n", - "tenant_name = \"vespa-team\" \n", + "tenant_name = \"vespa-team\"\n", "\n", "key = os.getenv(\"VESPA_TEAM_API_KEY\", None)\n", "if key is not None:\n", @@ -861,7 +887,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Index the documents in Vespa using the Vespa HTTP API. " + "Index the documents in Vespa using the Vespa HTTP API.\n" ] }, { @@ -883,7 +909,7 @@ "async with app.asyncio(connections=1, total_timeout=180) as session:\n", " for page in tqdm(vespa_feed):\n", " response: VespaResponse = await session.feed_data_point(\n", - " data_id=page['id'], fields=page, schema=\"pdf_page\"\n", + " data_id=page[\"id\"], fields=page, schema=\"pdf_page\"\n", " )\n", " if not response.is_successful():\n", " print(response.json())" @@ -897,8 +923,8 @@ "source": [ "### Querying Vespa\n", "\n", - "Ok, so now we have indexed the PDF pages in Vespa. Let us now obtain ColPali embeddings for a few text queries and \n", - "use it during ranking of the indexed pdf pages. " + "Ok, so now we have indexed the PDF pages in Vespa. Let us now obtain ColPali embeddings for a few text queries and\n", + "use it during ranking of the indexed pdf pages.\n" ] }, { @@ -916,15 +942,18 @@ }, "outputs": [], "source": [ - "queries = [\"Percentage of non-fresh water as source?\", \n", - " \"Policies related to nature risk?\", \"How much of produced water is recycled?\"]" + "queries = [\n", + " \"Percentage of non-fresh water as source?\",\n", + " \"Policies related to nature risk?\",\n", + " \"How much of produced water is recycled?\",\n", + "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Obtain the query embeddings using the ColPali model:" + "Obtain the query embeddings using the ColPali model:\n" ] }, { @@ -936,17 +965,17 @@ "outputs": [], "source": [ "dataloader = DataLoader(\n", - " queries,\n", - " batch_size=1,\n", - " shuffle=False,\n", - " collate_fn=lambda x: processor.process_queries(x),\n", - " )\n", + " queries,\n", + " batch_size=1,\n", + " shuffle=False,\n", + " collate_fn=lambda x: processor.process_queries(x),\n", + ")\n", "qs = []\n", "for batch_query in dataloader:\n", - " with torch.no_grad():\n", - " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", - " embeddings_query = model(**batch_query)\n", - " qs.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))\n" + " with torch.no_grad():\n", + " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", + " embeddings_query = model(**batch_query)\n", + " qs.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" ] }, { @@ -964,24 +993,25 @@ "source": [ "from IPython.display import display, HTML\n", "\n", + "\n", "def display_query_results(query, response, hits=5):\n", - " \n", - " \n", - " query_time = response.json.get('timing', {}).get('searchtime', -1)\n", + " query_time = response.json.get(\"timing\", {}).get(\"searchtime\", -1)\n", " 
query_time = round(query_time, 2)\n", - " count = response.json.get('root', {}).get('fields', {}).get('totalCount', 0)\n", - " html_content = f'
<h3>Query text: \'{query}\', query time {query_time}s, count={count}, top results:</h3>
'\n", - " \n", - " for i, hit in enumerate(response.hits[:hits]): \n", - " title = hit['fields']['title']\n", - " url = hit['fields']['url']\n", - " page = hit['fields']['page_number']\n", - " image = hit['fields']['image']\n", - " score = hit['relevance']\n", - " \n", - " html_content += f'
<h4>PDF Result {i + 1}</h4>
'\n", + " count = response.json.get(\"root\", {}).get(\"fields\", {}).get(\"totalCount\", 0)\n", + " html_content = f\"
<h3>Query text: '{query}', query time {query_time}s, count={count}, top results:</h3>
\"\n", + "\n", + " for i, hit in enumerate(response.hits[:hits]):\n", + " title = hit[\"fields\"][\"title\"]\n", + " url = hit[\"fields\"][\"url\"]\n", + " page = hit[\"fields\"][\"page_number\"]\n", + " image = hit[\"fields\"][\"image\"]\n", + " score = hit[\"relevance\"]\n", + "\n", + " html_content += f\"
<h4>PDF Result {i + 1}</h4>
\"\n", " html_content += f'
<p>Title: {title}, page {page+1} with score {score:.2f}</p>
'\n", - " html_content += f''\n", + " html_content += (\n", + " f''\n", + " )\n", "\n", " display(HTML(html_content))" ] @@ -994,7 +1024,7 @@ "\n", "Note that we retrieve using textual representation with `userInput(@userQuery)`, this means that we use the BM25 ranking for the extracted text in the first ranking phase and then re-rank the top-k pages using the ColPali embeddings.\n", "\n", - "Later in this notebook we will use Vespa's support for approximate nearest neighbor search (`nearestNeighbor`) to retrieve directly using the ColPali embeddings." + "Later in this notebook we will use Vespa's support for approximate nearest neighbor search (`nearestNeighbor`) to retrieve directly using the ColPali embeddings.\n" ] }, { @@ -1051,10 +1081,7 @@ " userQuery=query,\n", " timeout=120,\n", " hits=3,\n", - " body={\n", - " \"input.query(qt)\": query_embedding,\n", - " \"presentation.timing\": True\n", - " },\n", + " body={\"input.query(qt)\": query_embedding, \"presentation.timing\": True},\n", " )\n", " assert response.is_successful()\n", " display_query_results(query, response)" @@ -1066,19 +1093,19 @@ "source": [ "### Using nearestNeighbor for retrieval\n", "\n", - "In the above example, we used the ColPali embeddings in ranking, but using the text query for retrieval. \n", + "In the above example, we used the ColPali embeddings in ranking, but using the text query for retrieval.\n", "This is a reasonable approach for text-heavy documents where the text representation is the most important and where ColPali embeddings are used to\n", - "re-rank the top-k documents from the text retrieval phase. \n", + "re-rank the top-k documents from the text retrieval phase.\n", "\n", "In some cases, the ColPali embeddings are the most important and we want\n", "to demonstrate how we can use HNSW indexing with binary hamming distance to retrieve the most similar pages to a query and\n", - "then have two steps of re-ranking using the ColPali embeddings. \n", + "then have two steps of re-ranking using the ColPali embeddings.\n", "\n", "All the phases here are executed locally inside the Vespa content node(s) so that no vector data needs\n", "to cross the network.\n", "\n", "Let us add a new rank-profile to the schema, the `nearestNeighbor` operator takes a query tensor and a field tensor as argument and\n", - "we need to define the query tensors types in the rank-profile. " + "we need to define the query tensors types in the rank-profile.\n" ] }, { @@ -1091,7 +1118,7 @@ "\n", "input_query_tensors = []\n", "MAX_QUERY_TERMS = 64\n", - "for i in range(MAX_QUERY_TERMS): \n", + "for i in range(MAX_QUERY_TERMS):\n", " input_query_tensors.append((f\"query(rq{i})\", \"tensor(v[16])\"))\n", "\n", "input_query_tensors.append((\"query(qt)\", \"tensor(querytoken{}, v[128])\"))\n", @@ -1129,10 +1156,10 @@ " querytoken\n", " )\n", " \"\"\",\n", - " )\n", + " ),\n", " ],\n", " first_phase=FirstPhaseRanking(expression=\"max_sim_binary\"),\n", - " second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=10)\n", + " second_phase=SecondPhaseRanking(expression=\"max_sim\", rerank_count=10),\n", ")\n", "colpali_schema.add_rank_profile(colpali_retrieval_profile)" ] @@ -1141,7 +1168,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We define two functions, one for the first phase and one for the second phase. Instead of the float representations, we use the binary representations with inverted hamming distance in the first phase. Now, we need to re-deploy the application to Vespa Cloud." 
+ "We define two functions, one for the first phase and one for the second phase. Instead of the float representations, we use the binary representations with inverted hamming distance in the first phase. Now, we need to re-deploy the application to Vespa Cloud.\n" ] }, { @@ -1159,7 +1186,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can query Vespa with the text queries and use the `nearestNeighbor` operator to retrieve the most similar pages to the query and pass the different query tensors. " + "Now we can query Vespa with the text queries and use the `nearestNeighbor` operator to retrieve the most similar pages to the query and pass the different query tensors.\n" ] }, { @@ -1206,26 +1233,33 @@ ], "source": [ "from vespa.io import VespaQueryResponse\n", - "target_hits_per_query_tensor = 20 # this is a hyper parameter that can be tuned for speed versus accuracy\n", + "\n", + "target_hits_per_query_tensor = (\n", + " 20 # this is a hyper parameter that can be tuned for speed versus accuracy\n", + ")\n", "async with app.asyncio(connections=1, total_timeout=180) as session:\n", " for idx, query in enumerate(queries):\n", " float_query_embedding = {k: v.tolist() for k, v in enumerate(qs[idx])}\n", " binary_query_embeddings = dict()\n", " for k, v in float_query_embedding.items():\n", - " binary_query_embeddings[k] = np.packbits(np.where(np.array(v) > 0, 1, 0)).astype(np.int8).tolist()\n", - " \n", + " binary_query_embeddings[k] = (\n", + " np.packbits(np.where(np.array(v) > 0, 1, 0)).astype(np.int8).tolist()\n", + " )\n", + "\n", " # The mixed tensors used in MaxSim calculations\n", - " # We use both binary and float representations \n", - " query_tensors={\n", - " \"input.query(qtb)\": binary_query_embeddings,\n", - " \"input.query(qt)\": float_query_embedding\n", + " # We use both binary and float representations\n", + " query_tensors = {\n", + " \"input.query(qtb)\": binary_query_embeddings,\n", + " \"input.query(qt)\": float_query_embedding,\n", " }\n", " # The query tensors used in the nearest neighbor calculations\n", - " for i in range(0,len(binary_query_embeddings)):\n", + " for i in range(0, len(binary_query_embeddings)):\n", " query_tensors[f\"input.query(rq{i})\"] = binary_query_embeddings[i]\n", " nn = []\n", - " for i in range(0,len(binary_query_embeddings)):\n", - " nn.append(f\"({{targetHits:{target_hits_per_query_tensor}}}nearestNeighbor(embedding,rq{i}))\")\n", + " for i in range(0, len(binary_query_embeddings)):\n", + " nn.append(\n", + " f\"({{targetHits:{target_hits_per_query_tensor}}}nearestNeighbor(embedding,rq{i}))\"\n", + " )\n", " # We use a OR operator to combine the nearest neighbor operator\n", " nn = \" OR \".join(nn)\n", " response: VespaQueryResponse = await session.query(\n", @@ -1233,10 +1267,7 @@ " ranking=\"retrieval-and-rerank\",\n", " timeout=120,\n", " hits=3,\n", - " body={\n", - " **query_tensors,\n", - " \"presentation.timing\": True\n", - " }\n", + " body={**query_tensors, \"presentation.timing\": True},\n", " )\n", " assert response.is_successful()\n", " display_query_results(query, response)" @@ -1246,23 +1277,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Depending on the scale, we can evaluate changing different number of targetHits per nearestNeighbor operator and the ranking depths in the two phases. \n", - "We can also parallelize the ranking phases by using more threads per query request to reduce latency." 
+ "Depending on the scale, we can evaluate changing different number of targetHits per nearestNeighbor operator and the ranking depths in the two phases.\n", + "We can also parallelize the ranking phases by using more threads per query request to reduce latency.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Summary \n", + "## Summary\n", "\n", - "In this notebook, we have demonstrated how to represent the new ColQwen2 in Vespa. \n", - "We have generated embeddings for images of PDF pages using ColQwen2 and stored the embeddings in Vespa using [mixed tensors](https://docs.vespa.ai/en/tensor-user-guide.html). \n", + "In this notebook, we have demonstrated how to represent the new ColQwen2 in Vespa.\n", + "We have generated embeddings for images of PDF pages using ColQwen2 and stored the embeddings in Vespa using [mixed tensors](https://docs.vespa.ai/en/tensor-user-guide.html).\n", "\n", - "We demonstrated how to store the base64 encoded image using the `raw` Vespa field type, plus meta data like title and url. \n", - "We have demonstrated how to retrieve relevant pages for a query using the embeddings generated by ColPali. \n", + "We demonstrated how to store the base64 encoded image using the `raw` Vespa field type, plus meta data like title and url.\n", + "We have demonstrated how to retrieve relevant pages for a query using the embeddings generated by ColPali.\n", "\n", - "This notebook can be extended to include more complex ranking models, more complex queries, and more complex data structures, including metadata and other fields which can be filtered on or used for ranking." + "This notebook can be extended to include more complex ranking models, more complex queries, and more complex data structures, including metadata and other fields which can be filtered on or used for ranking.\n" ] } ], From c0782b93fdc7a5195585d7a00b0790ef6c02bbf3 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Wed, 2 Oct 2024 11:22:15 +0200 Subject: [PATCH 3/3] exclude ColQwen notebook from ci --- .github/workflows/notebooks-cloud.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/notebooks-cloud.yml b/.github/workflows/notebooks-cloud.yml index de5607ca..f2bb5af2 100644 --- a/.github/workflows/notebooks-cloud.yml +++ b/.github/workflows/notebooks-cloud.yml @@ -26,7 +26,7 @@ jobs: - name: Set output variable (Make sure it is this quote format - "[path/to/notebook1.ipynb", "path/to/notebook2.ipynb]") id: set_output run: | - notebooks=$(find docs/sphinx/source -name '*cloud.ipynb' ! -name 'mother-of-all-embedding-models-cloud.ipynb' ! -name 'scaling-personal-ai-assistants-with-streaming-mode-cloud.ipynb' ! -name 'colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb' | jq -R -s -c 'split("\n")[:-1]') + notebooks=$(find docs/sphinx/source -name '*cloud.ipynb' ! -name 'pdf-retrieval-with-ColQwen2-vlm_Vespa-cloud.ipynb' ! -name 'mother-of-all-embedding-models-cloud.ipynb' ! -name 'scaling-personal-ai-assistants-with-streaming-mode-cloud.ipynb' ! -name 'colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb' | jq -R -s -c 'split("\n")[:-1]') # Print all notebooks echo echo $notebooks echo "notebooks=$notebooks" >> $GITHUB_OUTPUT