From 855cf4853cf337b72b0f5ca3fd8ade5526c5adcd Mon Sep 17 00:00:00 2001 From: jeadie Date: Thu, 19 Dec 2024 09:41:10 +1000 Subject: [PATCH 1/7] fix naming of titles and in sidebar --- .../features/large-language-models/model-serving.md | 0 .../docs/features/machine-learning-models/index.md | 10 ++++++++++ .../{ml-model-serving/index.md => serving.md} | 0 3 files changed, 10 insertions(+) delete mode 100644 spiceaidocs/docs/features/large-language-models/model-serving.md create mode 100644 spiceaidocs/docs/features/machine-learning-models/index.md rename spiceaidocs/docs/features/machine-learning-models/{ml-model-serving/index.md => serving.md} (100%) diff --git a/spiceaidocs/docs/features/large-language-models/model-serving.md b/spiceaidocs/docs/features/large-language-models/model-serving.md deleted file mode 100644 index e69de29b..00000000 diff --git a/spiceaidocs/docs/features/machine-learning-models/index.md b/spiceaidocs/docs/features/machine-learning-models/index.md new file mode 100644 index 00000000..c59de2ad --- /dev/null +++ b/spiceaidocs/docs/features/machine-learning-models/index.md @@ -0,0 +1,10 @@ +--- +title: 'Machine Learning Models' +sidebar_label: 'Machine Learning Models' +pagination_prev: null +pagination_next: null +--- + +import DocCardList from '@theme/DocCardList'; + + diff --git a/spiceaidocs/docs/features/machine-learning-models/ml-model-serving/index.md b/spiceaidocs/docs/features/machine-learning-models/serving.md similarity index 100% rename from spiceaidocs/docs/features/machine-learning-models/ml-model-serving/index.md rename to spiceaidocs/docs/features/machine-learning-models/serving.md From 5378fae61d2c9538a6b92a6021d3c5414bb20107 Mon Sep 17 00:00:00 2001 From: jeadie Date: Sat, 21 Dec 2024 11:41:36 +1000 Subject: [PATCH 2/7] embedding feature docs --- spiceaidocs/docs/features/embeddings/index.md | 222 ++++++++++++++++++ .../features/machine-learning-models/index.md | 1 + spiceaidocs/docs/features/search/index.md | 22 +- 3 files 
changed, 224 insertions(+), 21 deletions(-) create mode 100644 spiceaidocs/docs/features/embeddings/index.md diff --git a/spiceaidocs/docs/features/embeddings/index.md b/spiceaidocs/docs/features/embeddings/index.md new file mode 100644 index 00000000..2d55bde2 --- /dev/null +++ b/spiceaidocs/docs/features/embeddings/index.md @@ -0,0 +1,222 @@ +--- +title: 'Embedding Datasets' +sidebar_label: 'Embedding Datasets' +description: 'Learn how to define, or augment existing datasets with embedding column(s).' +sidebar_position: 11 +pagination_prev: null +pagination_next: null +--- + +# Embedding Datasets + +Learn how to define and augment datasets with embedding columns for advanced search capabilities. + +## Overview + +Spice supports three methods for working with embeddings in datasets: + +1. **Passthrough Embeddings**: Using existing embeddings from the underlying source datasets. +2. **Just-in-Time (JIT) Embeddings**: Compute embeddings for the dataset, on-demand, during query execution. +3. **Accelerated Embeddings**: Precompute embeddings by accelerating the source dataset. + +## Configuring Embedding Models + +Before configuring dataset embeddings, you must define the embedding models in your `spicepod.yaml`, for example: + +```yaml +embeddings: + - name: local_embedding_model + from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + + - from: openai + name: remote_service + params: + openai_api_key: ${ secrets:SPICE_OPENAI_API_KEY } +``` + +See [Embedding components](/components/embeddings/) for more information on embedding models. + +## Embedding Methods + +### Pass-through Embeddings + +Datasets that already include embeddings can utilize the same functionalities (e.g., vector search) as those augmented with embeddings using Spice. To ensure compatibility, these table columns must adhere to the following constraints: + +#### Example + +A `sales` table with an `address` column that has an embedding. 
+ +```shell +sql> describe sales; ++-------------------+-----------------------------------------+-------------+ +| column_name | data_type | is_nullable | ++-------------------+-----------------------------------------+-------------+ +| order_number | Int64 | YES | +| quantity_ordered | Int64 | YES | +| price_each | Float64 | YES | +| order_line_number | Int64 | YES | +| address | Utf8 | YES | +| address_embedding | FixedSizeList( | NO | +| | Field { | | +| | name: "item", | | +| | data_type: Float32, | | +| | nullable: false, | | +| | dict_id: 0, | | +| | dict_is_ordered: false, | | +| | metadata: {} | | +| | }, | | +| | 384 | | ++-------------------+-----------------------------------------+-------------+ +``` + +The same table if it was chunked: + +```shell +sql> describe sales; ++-------------------+-----------------------------------------+-------------+ +| column_name | data_type | is_nullable | ++-------------------+-----------------------------------------+-------------+ +| order_number | Int64 | YES | +| quantity_ordered | Int64 | YES | +| price_each | Float64 | YES | +| order_line_number | Int64 | YES | +| address | Utf8 | YES | +| address_embedding | List(Field { | NO | +| | name: "item", | | +| | data_type: FixedSizeList( | | +| | Field { | | +| | name: "item", | | +| | data_type: Float32, | | +| | }, | | +| | 384 | | +| | ), | | +| | }) | | ++-------------------+-----------------------------------------+-------------+ +| address_offset | List(Field { | NO | +| | name: "item", | | +| | data_type: FixedSizeList( | | +| | Field { | | +| | name: "item", | | +| | data_type: Int32, | | +| | }, | | +| | 2 | | +| | ), | | +| | }) | | ++-------------------+-----------------------------------------+-------------+ +``` + +Passthrough embedding columns still must be defined in the `spicepod.yaml` file. The spicepod must also have access to the same embedding model used to generate the embeddings. 
+```yaml +datasets: + - from: sftp://remote-sftp-server.com/sales/2024.csv + name: sales + columns: + - name: address + embeddings: + - from: local_embedding_model # Original embedding model used for this column +``` + +#### Requirements +1. **Underlying Column Presence:** + - The underlying column must exist in the table, and be of `string` [Arrow data type](reference/datatypes.md) . + +2. **Embeddings Column Naming Convention:** + - For each underlying column, the corresponding embeddings column must be named as `_embedding`. For example, a `customer_reviews` table with a `review` column must have a `review_embedding` column. + +3. **Embeddings Column Data Type:** + - The embeddings column must have the following [Arrow data type](reference/datatypes.md) when loaded into Spice: + 1. `FixedSizeList[Float32 or Float64, N]`, where `N` is the dimension (size) of the embedding vector. `FixedSizeList` is used for efficient storage and processing of fixed-size vectors. + 2. If the column is [**chunked**](#chunking-support), use `List[FixedSizeList[Float32 or Float64, N]]`. + +4. **Offset Column for Chunked Data:** + - If the underlying column is chunked, there must be an additional offset column named `_offsets` with the following Arrow data type: + 1. `List[FixedSizeList[Int32, 2]]`, where each element is a pair of integers `[start, end]` representing the start and end indices of the chunk in the underlying text column. This offset column maps each chunk in the embeddings back to the corresponding segment in the underlying text column. + - _For instance, `[[0, 100], [101, 200]]` indicates two chunks covering indices 0–100 and 101–200, respectively._ + +By following these guidelines, you can ensure that your dataset with pre-existing embeddings is fully compatible with the vector search and other embedding functionalities provided by Spice. + +### Just-in-Time (JIT) Embeddings + +JIT embeddings are computed during query execution. 
This is useful when you can't or don't want to pre-compute embeddings (e.g. if the dataset is large, infrequently queried, has heavy prefiltering). To add an embedding column, specify it within the dataset's column. + +```yaml +datasets: + - name: invoices + from: sftp://remote-sftp-server.com/invoices/2024/ + columns: + - name: line_item_details + embeddings: + - from: my_embedding_model + params: + file_format: parquet + +embeddings: + # Or any model you like! + - from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + name: my_embedding_model +``` + +### Accelerated Embeddings +To improve query performance, column embeddings can be precomputed, and stored in any [data accelerator](/components/data-accelerators/index.md). The only change required for this it to set up the data accelerator. For example, just add +```yaml +acceleration: + enabled: true +``` +to the dataset configuration. All other data accelerator configurations are optional, but can be applied. + +**Full example:** +```yaml +datasets: + - name: invoices + from: sftp://remote-sftp-server.com/invoices/2024/ + acceleration: + enabled: true + columns: + - name: line_item_details + embeddings: + - from: my_embedding_model + params: + file_format: parquet +``` + +## Advanced Configuration + +### Chunking + +Spice also supports chunking of content before embedding, which is useful for large text columns such as those found in [Document Tables](/components/data-connectors/index.md#document-support). Chunking ensures that only the most relevant portions of text are returned during search queries. Chunking is configured as part of the embedding configuration. 
+ +```yaml +datasets: + - from: github:github.com/spiceai/spiceai/issues + name: spiceai.issues + acceleration: + enabled: true + columns: + - name: body + embeddings: + - from: local_embedding_model + chunking: + enabled: true + target_chunk_size: 512 +``` + +The `body` column will be divided into chunks of approximately 512 tokens, while maintaining structural and semantic integrity (e.g. not splitting sentences). See the [API reference](/reference/spicepod/datasets.md#columns-embeddings-chunking) for full details. + +#### Row Identifiers + +Like a primary key, the `row_id` field specifies which column(s) uniquely identifies each row. This is useful for embedding datasets that don't have a primary key by default. This is important for chunked embedding datasets, so that operations (e.g. [`v1/search`](/api/http/search)), can be able to map multiple chunked vectors to a single dataset row. The `row_id` can be set in the `columns[*].embeddings[*].row_id`. +```yaml +datasets: + - from: github:github.com/spiceai/spiceai/issues + name: spiceai.issues + acceleration: + enabled: true + columns: + - name: body + embeddings: + - from: local_embedding_model + chunking: + enabled: true + target_chunk_size: 512 + row_id: id +``` diff --git a/spiceaidocs/docs/features/machine-learning-models/index.md b/spiceaidocs/docs/features/machine-learning-models/index.md index c59de2ad..c6ff0e5f 100644 --- a/spiceaidocs/docs/features/machine-learning-models/index.md +++ b/spiceaidocs/docs/features/machine-learning-models/index.md @@ -1,6 +1,7 @@ --- title: 'Machine Learning Models' sidebar_label: 'Machine Learning Models' +sidebar_position: 10 pagination_prev: null pagination_next: null --- diff --git a/spiceaidocs/docs/features/search/index.md b/spiceaidocs/docs/features/search/index.md index 1771997e..2fa6e652 100644 --- a/spiceaidocs/docs/features/search/index.md +++ b/spiceaidocs/docs/features/search/index.md @@ -74,26 +74,6 @@ For more details, see the [API reference for 
/v1/search](/api/http/search). Spice also supports vector search on datasets with preexisting embeddings. See [below](#preexisting-embeddings) for compatibility details. -### Chunking Support - -Spice also supports chunking of content before embedding, which is useful for large text columns such as those found in [Document Tables](/components/data-connectors/index.md#document-support). Chunking ensures that only the most relevant portions of text are returned during search queries. Chunking is configured as part of the embedding configuration. - -```yaml -datasets: - - from: github:github.com/spiceai/spiceai/issues - name: spiceai.issues - acceleration: - enabled: true - embeddings: - - column: body - from: local_embedding_model - chunking: - enabled: true - target_chunk_size: 512 -``` - -The `body` column will be divided into chunks of approximately 512 tokens, while maintaining structural and semantic integrity (e.g. not splitting sentences). - ### Document Retrieval When performing searches on datasets with chunking enabled, Spice returns the most relevant chunk for each match. To retrieve the full content of a column, include the embedding column in the `additional_columns` list. @@ -156,7 +136,7 @@ Datasets that already include embeddings can utilize the same functionalities (e - The embeddings column must have the following [Arrow data type](reference/datatypes.md) when loaded into Spice: 1. `FixedSizeList[Float32 or Float64, N]`, where `N` is the dimension (size) of the embedding vector. `FixedSizeList` is used for efficient storage and processing of fixed-size vectors. - 2. If the column is [**chunked**](#chunking-support), use `List[FixedSizeList[Float32 or Float64, N]]`. + 2. If the column is [**chunked**](/features/embeddings/index.md#chunking), use `List[FixedSizeList[Float32 or Float64, N]]`. 4. 
**Offset Column for Chunked Data:** - If the underlying column is chunked, there must be an additional offset column named `_offsets` with the following Arrow data type: From bfb525ddb98c429b90b73f0933d75f86df279cb8 Mon Sep 17 00:00:00 2001 From: jeadie Date: Sat, 21 Dec 2024 11:45:09 +1000 Subject: [PATCH 3/7] linking --- spiceaidocs/docs/features/search/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiceaidocs/docs/features/search/index.md b/spiceaidocs/docs/features/search/index.md index 2fa6e652..9e8ad881 100644 --- a/spiceaidocs/docs/features/search/index.md +++ b/spiceaidocs/docs/features/search/index.md @@ -72,7 +72,7 @@ curl -XPOST http://localhost:8090/v1/search \ For more details, see the [API reference for /v1/search](/api/http/search). -Spice also supports vector search on datasets with preexisting embeddings. See [below](#preexisting-embeddings) for compatibility details. +Spice also supports vector search on datasets with preexisting embeddings. See [below](/features/embeddings/index.md#passthrough-embeddings) for compatibility details. 
### Document Retrieval From 2948c3645e03aae710862a23aa018e50f95f14f1 Mon Sep 17 00:00:00 2001 From: jeadie Date: Sat, 21 Dec 2024 12:00:44 +1000 Subject: [PATCH 4/7] linking --- spiceaidocs/docs/features/embeddings/index.md | 6 +++--- spiceaidocs/docs/features/search/index.md | 4 ++-- spiceaidocs/docs/reference/spicepod/datasets.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spiceaidocs/docs/features/embeddings/index.md b/spiceaidocs/docs/features/embeddings/index.md index 2d55bde2..7789ed5d 100644 --- a/spiceaidocs/docs/features/embeddings/index.md +++ b/spiceaidocs/docs/features/embeddings/index.md @@ -38,7 +38,7 @@ See [Embedding components](/components/embeddings/) for more information on embe ## Embedding Methods -### Pass-through Embeddings +### Passthrough Embeddings Datasets that already include embeddings can utilize the same functionalities (e.g., vector search) as those augmented with embeddings using Spice. To ensure compatibility, these table columns must adhere to the following constraints: @@ -126,7 +126,7 @@ datasets: 3. **Embeddings Column Data Type:** - The embeddings column must have the following [Arrow data type](reference/datatypes.md) when loaded into Spice: 1. `FixedSizeList[Float32 or Float64, N]`, where `N` is the dimension (size) of the embedding vector. `FixedSizeList` is used for efficient storage and processing of fixed-size vectors. - 2. If the column is [**chunked**](#chunking-support), use `List[FixedSizeList[Float32 or Float64, N]]`. + 2. If the column is [**chunked**](#chunking), use `List[FixedSizeList[Float32 or Float64, N]]`. 4. **Offset Column for Chunked Data:** - If the underlying column is chunked, there must be an additional offset column named `_offsets` with the following Arrow data type: @@ -200,7 +200,7 @@ datasets: target_chunk_size: 512 ``` -The `body` column will be divided into chunks of approximately 512 tokens, while maintaining structural and semantic integrity (e.g. 
not splitting sentences). See the [API reference](/reference/spicepod/datasets.md#columns-embeddings-chunking) for full details. +The `body` column will be divided into chunks of approximately 512 tokens, while maintaining structural and semantic integrity (e.g. not splitting sentences). See the [API reference](/reference/spicepod/datasets#columns-embeddings-chunking) for full details. #### Row Identifiers diff --git a/spiceaidocs/docs/features/search/index.md b/spiceaidocs/docs/features/search/index.md index 9e8ad881..b40dce18 100644 --- a/spiceaidocs/docs/features/search/index.md +++ b/spiceaidocs/docs/features/search/index.md @@ -72,7 +72,7 @@ curl -XPOST http://localhost:8090/v1/search \ For more details, see the [API reference for /v1/search](/api/http/search). -Spice also supports vector search on datasets with preexisting embeddings. See [below](/features/embeddings/index.md#passthrough-embeddings) for compatibility details. +Spice also supports vector search on datasets with preexisting embeddings. See [below](/features/embeddings#passthrough-embeddings) for compatibility details. ### Document Retrieval @@ -136,7 +136,7 @@ Datasets that already include embeddings can utilize the same functionalities (e - The embeddings column must have the following [Arrow data type](reference/datatypes.md) when loaded into Spice: 1. `FixedSizeList[Float32 or Float64, N]`, where `N` is the dimension (size) of the embedding vector. `FixedSizeList` is used for efficient storage and processing of fixed-size vectors. - 2. If the column is [**chunked**](/features/embeddings/index.md#chunking), use `List[FixedSizeList[Float32 or Float64, N]]`. + 2. If the column is [**chunked**](/features/embeddings#chunking), use `List[FixedSizeList[Float32 or Float64, N]]`. 4. 
**Offset Column for Chunked Data:** - If the underlying column is chunked, there must be an additional offset column named `_offsets` with the following Arrow data type: diff --git a/spiceaidocs/docs/reference/spicepod/datasets.md b/spiceaidocs/docs/reference/spicepod/datasets.md index c14c5af2..18f21dba 100644 --- a/spiceaidocs/docs/reference/spicepod/datasets.md +++ b/spiceaidocs/docs/reference/spicepod/datasets.md @@ -422,7 +422,7 @@ Optional. For datasets without a primary key, used to explicitly specify column( Specifying a `row_id` enables unique identifier lookups for datasets from external systems that may not have a primary key. -## `columns[*].embeddings[*].chunking` +## `columns[*].embeddings[*].chunking` {#columns-embeddings-chunking} Optional. The configuration to enable and define the chunking strategy for the embedding column. @@ -501,7 +501,7 @@ Optional. The number of tokens to overlap between chunks. Defaults to `0`. Optional. If enabled, the content of each chunk will be trimmed to remove leading and trailing whitespace. Defaults to `true`. -## `metdata` +## `metdata` {#metadata} Optional. Additional key-value metadata for the dataset. Used as part of the [Semantic Data Model](/features/semantic-model/index.md). From 6dd06b068d2c7a804e2941c7d0a44ce7cfcf1dcc Mon Sep 17 00:00:00 2001 From: jeadie Date: Sat, 21 Dec 2024 12:03:38 +1000 Subject: [PATCH 5/7] last link --- spiceaidocs/docs/api/http/search.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiceaidocs/docs/api/http/search.md b/spiceaidocs/docs/api/http/search.md index e43c8758..e95be51d 100644 --- a/spiceaidocs/docs/api/http/search.md +++ b/spiceaidocs/docs/api/http/search.md @@ -76,4 +76,4 @@ Response } ``` -The `v1/search` endpoint supports [chunked](/features/search/index.md#chunking) embedding columns. +The `v1/search` endpoint supports [chunked](/features/embeddings#chunking) embedding columns. 
From 5b99b8bdfa260d9fc54dc2a42288e7824a6f6d48 Mon Sep 17 00:00:00 2001 From: jeadie Date: Sat, 21 Dec 2024 12:27:00 +1000 Subject: [PATCH 6/7] initial review --- spiceaidocs/docs/features/embeddings/index.md | 110 +++++++++--------- spiceaidocs/docs/reference/spicepod/index.md | 2 +- 2 files changed, 58 insertions(+), 54 deletions(-) diff --git a/spiceaidocs/docs/features/embeddings/index.md b/spiceaidocs/docs/features/embeddings/index.md index 7789ed5d..a938cdb8 100644 --- a/spiceaidocs/docs/features/embeddings/index.md +++ b/spiceaidocs/docs/features/embeddings/index.md @@ -13,11 +13,11 @@ Learn how to define and augment datasets with embedding columns for advanced sea ## Overview -Spice supports three methods for working with embeddings in datasets: +Spice supports three methods for working with embedding columns within datasets: -1. **Passthrough Embeddings**: Using existing embeddings from the underlying source datasets. -2. **Just-in-Time (JIT) Embeddings**: Compute embeddings for the dataset, on-demand, during query execution. -3. **Accelerated Embeddings**: Precompute embeddings by accelerating the source dataset. +1. [**Just-in-Time (JIT) Embeddings**](#jit-embeddings): Computes embeddings for the dataset, on-demand, during query execution. +2. [**Accelerated Embeddings**](#accelerated-embeddings): Precompute embeddings by accelerating the source dataset. +3. [**Passthrough Embeddings**](#passthrough-embeddings): Uses the existing embeddings from the underlying source datasets. ## Configuring Embedding Models @@ -37,10 +37,53 @@ embeddings: See [Embedding components](/components/embeddings/) for more information on embedding models. ## Embedding Methods +### Just-in-Time (JIT) Embeddings {#jit-embeddings} + +JIT embeddings are computed during query execution. This is useful when you can't or don't want to pre-compute embeddings (e.g. if the dataset is large, infrequently queried, has heavy prefiltering). 
To add an embedding column, specify it within the dataset's column. + +```yaml +datasets: + - name: invoices + from: sftp://remote-sftp-server.com/invoices/2024/ + columns: + - name: line_item_details + embeddings: + - from: my_embedding_model + params: + file_format: parquet + +embeddings: + # Or any model you like! + - from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + name: my_embedding_model +``` + +### Accelerated Embeddings +To improve query performance, column embeddings can be precomputed, and stored in any [data accelerator](/components/data-accelerators/index.md). The only change required for this is to set up the data accelerator. For example, just add +```yaml +acceleration: + enabled: true +``` +to the dataset configuration. All other data accelerator configurations are optional, but can be applied as per their respective [documentation](/components/data-accelerators/index.md). + +**Full example:** +```yaml +datasets: + - name: invoices + from: sftp://remote-sftp-server.com/invoices/2024/ + acceleration: + enabled: true + columns: + - name: line_item_details + embeddings: + - from: my_embedding_model + params: + file_format: parquet +``` ### Passthrough Embeddings -Datasets that already include embeddings can utilize the same functionalities (e.g., vector search) as those augmented with embeddings using Spice. To ensure compatibility, these table columns must adhere to the following constraints: +Datasets that already have embedding columns can utilize the same functionalities (e.g. vector search) as those augmented with Spice-generated embeddings. They should follow the same schema as Spice-generated embeddings (or be altered with a [view](/reference/spicepod#views)). #### Example @@ -105,7 +148,7 @@ sql> describe sales; +-------------------+-----------------------------------------+-------------+ ``` -Passthrough embedding columns still must be defined in the `spicepod.yaml` file.
The spicepod must also have access to the same embedding model used to generate the embeddings. +Passthrough embedding columns must still be defined in the `spicepod.yaml` file. The Spice instance must also have access to the same embedding model used to generate the embeddings. ```yaml datasets: - from: sftp://remote-sftp-server.com/sales/2024.csv @@ -113,10 +156,16 @@ datasets: columns: - name: address embeddings: - - from: local_embedding_model # Original embedding model used for this column + - from: local_embedding_model + +embeddings: + - name: local_embedding_model # Original embedding model used for this column + ... ``` #### Requirements +To ensure compatibility, these table columns must adhere to the following constraints: + 1. **Underlying Column Presence:** - The underlying column must exist in the table, and be of `string` [Arrow data type](reference/datatypes.md) . @@ -133,54 +182,9 @@ datasets: 1. `List[FixedSizeList[Int32, 2]]`, where each element is a pair of integers `[start, end]` representing the start and end indices of the chunk in the underlying text column. This offset column maps each chunk in the embeddings back to the corresponding segment in the underlying text column. - _For instance, `[[0, 100], [101, 200]]` indicates two chunks covering indices 0–100 and 101–200, respectively._ -By following these guidelines, you can ensure that your dataset with pre-existing embeddings is fully compatible with the vector search and other embedding functionalities provided by Spice. - -### Just-in-Time (JIT) Embeddings - -JIT embeddings are computed during query execution.
- -```yaml -datasets: - - name: invoices - from: sftp://remote-sftp-server.com/invoices/2024/ - columns: - - name: line_item_details - embeddings: - - from: my_embedding_model - params: - file_format: parquet - -embeddings: - # Or any model you like! - - from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 - name: my_embedding_model -``` - -### Accelerated Embeddings -To improve query performance, column embeddings can be precomputed, and stored in any [data accelerator](/components/data-accelerators/index.md). The only change required for this it to set up the data accelerator. For example, just add -```yaml -acceleration: - enabled: true -``` -to the dataset configuration. All other data accelerator configurations are optional, but can be applied. - -**Full example:** -```yaml -datasets: - - name: invoices - from: sftp://remote-sftp-server.com/invoices/2024/ - acceleration: - enabled: true - columns: - - name: line_item_details - embeddings: - - from: my_embedding_model - params: - file_format: parquet -``` +By following these guidelines, you can ensure that your dataset with pre-existing embeddings is fully compatible with embedding functionalities provided by Spice. ## Advanced Configuration - ### Chunking Spice also supports chunking of content before embedding, which is useful for large text columns such as those found in [Document Tables](/components/data-connectors/index.md#document-support). Chunking ensures that only the most relevant portions of text are returned during search queries. Chunking is configured as part of the embedding configuration. 
diff --git a/spiceaidocs/docs/reference/spicepod/index.md b/spiceaidocs/docs/reference/spicepod/index.md index 6f83858e..edb86b66 100644 --- a/spiceaidocs/docs/reference/spicepod/index.md +++ b/spiceaidocs/docs/reference/spicepod/index.md @@ -314,7 +314,7 @@ dependencies: - spicehq/nfts ``` -## `views` +## `views` {#views} A Spicepod can contain one or more views which are virtual tables defined by SQL queries. From 9b2b723b4663ae09218a57f6d18588b1c73c5109 Mon Sep 17 00:00:00 2001 From: Jack Eadie Date: Tue, 24 Dec 2024 15:18:49 +1000 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Luke Kim <80174+lukekim@users.noreply.github.com> Co-authored-by: Phillip LeBlanc --- spiceaidocs/docs/features/embeddings/index.md | 19 ++++++++++--------- .../docs/reference/spicepod/datasets.md | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/spiceaidocs/docs/features/embeddings/index.md b/spiceaidocs/docs/features/embeddings/index.md index a938cdb8..3f7d73a1 100644 --- a/spiceaidocs/docs/features/embeddings/index.md +++ b/spiceaidocs/docs/features/embeddings/index.md @@ -7,21 +7,21 @@ pagination_prev: null pagination_next: null --- -# Embedding Datasets - Learn how to define and augment datasets with embedding columns for advanced search capabilities. ## Overview -Spice supports three methods for working with embedding columns within datasets: +Spice provides three distinct methods for handling embedding columns in datasets: -1. [**Just-in-Time (JIT) Embeddings**](#jit-embeddings): Computes embeddings for the dataset, on-demand, during query execution. -2. [**Accelerated Embeddings**](#accelerated-embeddings): Precompute embeddings by accelerating the source dataset. -3. [**Passthrough Embeddings**](#passthrough-embeddings): Uses the existing embeddings from the underlying source datasets. +1. 
**[Just-in-Time (JIT) Embeddings](#jit-embeddings)**: Dynamically computes embeddings, on-demand, during query execution, without precomputing data. + +2. **[Accelerated Embeddings](#accelerated-embeddings)**: Precomputes embeddings by transforming and augmenting the source dataset for faster query and search performance. + +3. **[Passthrough Embeddings](#passthrough-embeddings)**: Utilizes pre-existing embeddings directly from the underlying source datasets, bypassing any additional computation. ## Configuring Embedding Models -Before configuring dataset embeddings, you must define the embedding models in your `spicepod.yaml`, for example: +Before configuring dataset embeddings, define the embedding models in the `spicepod.yaml`, for example: ```yaml embeddings: @@ -39,7 +39,7 @@ See [Embedding components](/components/embeddings/) for more information on embe ## Embedding Methods ### Just-in-Time (JIT) Embeddings {#jit-embeddings} -JIT embeddings are computed during query execution. This is useful when you can't or don't want to pre-compute embeddings (e.g. if the dataset is large, infrequently queried, has heavy prefiltering). To add an embedding column, specify it within the dataset's column. +JIT embeddings are computed during query execution. This is useful when pre-computing embeddings is infeasible (e.g. if the dataset is large, infrequently queried, or has heavy prefiltering). To add an embedding column, specify it within the dataset's column. ```yaml datasets: @@ -182,7 +182,7 @@ To ensure compatibility, these table columns must adhere to the following constr 1. `List[FixedSizeList[Int32, 2]]`, where each element is a pair of integers `[start, end]` representing the start and end indices of the chunk in the underlying text column. This offset column maps each chunk in the embeddings back to the corresponding segment in the underlying text column.
- _For instance, `[[0, 100], [101, 200]]` indicates two chunks covering indices 0–100 and 101–200, respectively._ -By following these guidelines, you can ensure that your dataset with pre-existing embeddings is fully compatible with embedding functionalities provided by Spice. +Following these guidelines ensures that the dataset with pre-existing embeddings is fully compatible with embedding functionalities provided by Spice. ## Advanced Configuration ### Chunking @@ -209,6 +209,7 @@ The `body` column will be divided into chunks of approximately 512 tokens, while #### Row Identifiers Like a primary key, the `row_id` field specifies which column(s) uniquely identifies each row. This is useful for embedding datasets that don't have a primary key by default. This is important for chunked embedding datasets, so that operations (e.g. [`v1/search`](/api/http/search)), can be able to map multiple chunked vectors to a single dataset row. The `row_id` can be set in the `columns[*].embeddings[*].row_id`. + ```yaml datasets: - from: github:github.com/spiceai/spiceai/issues diff --git a/spiceaidocs/docs/reference/spicepod/datasets.md b/spiceaidocs/docs/reference/spicepod/datasets.md index 18f21dba..634cf5e3 100644 --- a/spiceaidocs/docs/reference/spicepod/datasets.md +++ b/spiceaidocs/docs/reference/spicepod/datasets.md @@ -501,7 +501,7 @@ Optional. The number of tokens to overlap between chunks. Defaults to `0`. Optional. If enabled, the content of each chunk will be trimmed to remove leading and trailing whitespace. Defaults to `true`. -## `metdata` {#metadata} +## `metadata` {#metadata} Optional. Additional key-value metadata for the dataset. Used as part of the [Semantic Data Model](/features/semantic-model/index.md).