diff --git a/examples/embeddings/simpleVector/main.go b/examples/embeddings/jsondb/main.go similarity index 76% rename from examples/embeddings/simpleVector/main.go rename to examples/embeddings/jsondb/main.go index 5270def4..d54f1aa6 100644 --- a/examples/embeddings/simpleVector/main.go +++ b/examples/embeddings/jsondb/main.go @@ -7,7 +7,7 @@ import ( openaiembedder "github.com/henomis/lingoose/embedder/openai" "github.com/henomis/lingoose/index" indexoption "github.com/henomis/lingoose/index/option" - simplevectorindex "github.com/henomis/lingoose/index/simpleVectorIndex" + "github.com/henomis/lingoose/index/vectordb/jsondb" "github.com/henomis/lingoose/llm/openai" "github.com/henomis/lingoose/loader" "github.com/henomis/lingoose/prompt" @@ -18,20 +18,22 @@ import ( func main() { - openaiEmbedder := openaiembedder.New(openaiembedder.AdaEmbeddingV2) + index := index.New( + jsondb.New("db.json"), + openaiembedder.New(openaiembedder.AdaEmbeddingV2), + ).WithIncludeContents(true) - docsVectorIndex := simplevectorindex.New("docs", ".", openaiEmbedder) - indexIsEmpty, _ := docsVectorIndex.IsEmpty() + indexIsEmpty, _ := index.IsEmpty(context.Background()) if indexIsEmpty { - err := ingestData(docsVectorIndex, openaiEmbedder) + err := ingestData(index) if err != nil { panic(err) } } query := "What is the purpose of the NATO Alliance?" 
- similarities, err := docsVectorIndex.Query( + similarities, err := index.Query( context.Background(), query, indexoption.WithTopK(3), @@ -52,7 +54,7 @@ func main() { documentContext += similarity.Content() + "\n\n" } - llmOpenAI := openai.NewCompletion() + llmOpenAI := openai.NewCompletion().WithVerbose(true) prompt1 := prompt.NewPromptTemplate( "Based on the following context answer to the question.\n\nContext:\n{{.context}}\n\nQuestion: {{.query}}").WithInputs( map[string]string{ @@ -74,7 +76,7 @@ func main() { fmt.Println(output) } -func ingestData(docsVectorIndex *simplevectorindex.Index, openaiEmbedder index.Embedder) error { +func ingestData(index *index.Index) error { fmt.Printf("Ingesting data...") @@ -87,7 +89,7 @@ func ingestData(docsVectorIndex *simplevectorindex.Index, openaiEmbedder index.E documentChunks := textSplitter.SplitDocuments(documents) - err = docsVectorIndex.LoadFromDocuments(context.Background(), documentChunks) + err = index.LoadFromDocuments(context.Background(), documentChunks) if err != nil { return err } diff --git a/examples/embeddings/knowledge_base/main.go b/examples/embeddings/knowledge_base/main.go index d3623ac6..2747bf1a 100644 --- a/examples/embeddings/knowledge_base/main.go +++ b/examples/embeddings/knowledge_base/main.go @@ -8,8 +8,10 @@ import ( "github.com/henomis/lingoose/chat" openaiembedder "github.com/henomis/lingoose/embedder/openai" + "github.com/henomis/lingoose/index" indexoption "github.com/henomis/lingoose/index/option" - simplevectorindex "github.com/henomis/lingoose/index/simpleVectorIndex" + + "github.com/henomis/lingoose/index/vectordb/jsondb" "github.com/henomis/lingoose/llm/openai" "github.com/henomis/lingoose/loader" "github.com/henomis/lingoose/prompt" @@ -23,13 +25,15 @@ const ( func main() { - openaiEmbedder := openaiembedder.New(openaiembedder.AdaEmbeddingV2) + index := index.New( + jsondb.New("db.json"), + openaiembedder.New(openaiembedder.AdaEmbeddingV2), + ).WithIncludeContents(true) - 
docsVectorIndex := simplevectorindex.New("db", ".", openaiEmbedder) - indexIsEmpty, _ := docsVectorIndex.IsEmpty() + indexIsEmpty, _ := index.IsEmpty(context.Background()) if indexIsEmpty { - err := ingestData(docsVectorIndex) + err := ingestData(index) if err != nil { panic(err) } @@ -49,7 +53,7 @@ func main() { break } - similarities, err := docsVectorIndex.Query(context.Background(), query, indexoption.WithTopK(3)) + similarities, err := index.Query(context.Background(), query, indexoption.WithTopK(3)) if err != nil { panic(err) } @@ -98,11 +102,11 @@ func main() { } -func ingestData(docsVectorIndex *simplevectorindex.Index) error { +func ingestData(index *index.Index) error { fmt.Printf("Learning Knowledge Base...") - loader := loader.NewPDFToTextLoader("./kb") + loader := loader.NewPDFToTextLoader("./kb").WithPDFToTextPath("/opt/homebrew/bin/pdftotext") documents, err := loader.Load(context.Background()) if err != nil { @@ -113,7 +117,7 @@ func ingestData(docsVectorIndex *simplevectorindex.Index) error { documentChunks := textSplitter.SplitDocuments(documents) - err = docsVectorIndex.LoadFromDocuments(context.Background(), documentChunks) + err = index.LoadFromDocuments(context.Background(), documentChunks) if err != nil { return err } diff --git a/examples/embeddings/pinecone/main.go b/examples/embeddings/pinecone/main.go index 2cd7fb15..6a51f12e 100644 --- a/examples/embeddings/pinecone/main.go +++ b/examples/embeddings/pinecone/main.go @@ -5,8 +5,9 @@ import ( "fmt" openaiembedder "github.com/henomis/lingoose/embedder/openai" + "github.com/henomis/lingoose/index" indexoption "github.com/henomis/lingoose/index/option" - pineconeindex "github.com/henomis/lingoose/index/pinecone" + pineconedb "github.com/henomis/lingoose/index/vectordb/pinecone" "github.com/henomis/lingoose/llm/openai" "github.com/henomis/lingoose/loader" "github.com/henomis/lingoose/prompt" @@ -17,37 +18,36 @@ import ( func main() { - openaiEmbedder := 
openaiembedder.New(openaiembedder.AdaEmbeddingV2) - - pineconeIndex := pineconeindex.New( - pineconeindex.Options{ - IndexName: "test", - Namespace: "test-namespace", - IncludeContent: true, - CreateIndex: &pineconeindex.CreateIndexOptions{ - Dimension: 1536, - Replicas: 1, - Metric: "cosine", - PodType: "p1.x1", + index := index.New( + pineconedb.New( + pineconedb.Options{ + IndexName: "test", + Namespace: "test-namespace", + CreateIndexOptions: &pineconedb.CreateIndexOptions{ + Dimension: 1536, + Replicas: 1, + Metric: "cosine", + PodType: "p1.x1", + }, }, - }, - openaiEmbedder, - ) + ), + openaiembedder.New(openaiembedder.AdaEmbeddingV2), + ).WithIncludeContents(true) - indexIsEmpty, err := pineconeIndex.IsEmpty(context.Background()) + indexIsEmpty, err := index.IsEmpty(context.Background()) if err != nil { panic(err) } if indexIsEmpty { - err = ingestData(pineconeIndex) + err = ingestData(index) if err != nil { panic(err) } } query := "What is the purpose of the NATO Alliance?" - similarities, err := pineconeIndex.Query( + similarities, err := index.Query( context.Background(), query, indexoption.WithTopK(3), @@ -88,7 +88,7 @@ func main() { } -func ingestData(pineconeIndex *pineconeindex.Index) error { +func ingestData(index *index.Index) error { documents, err := loader.NewDirectoryLoader(".", ".txt").Load(context.Background()) if err != nil { @@ -108,6 +108,6 @@ func ingestData(pineconeIndex *pineconeindex.Index) error { } - return pineconeIndex.LoadFromDocuments(context.Background(), documentChunks) + return index.LoadFromDocuments(context.Background(), documentChunks) } diff --git a/examples/embeddings/qdrant/main.go b/examples/embeddings/qdrant/main.go index 48b07da5..6c24ad26 100644 --- a/examples/embeddings/qdrant/main.go +++ b/examples/embeddings/qdrant/main.go @@ -5,8 +5,9 @@ import ( "fmt" openaiembedder "github.com/henomis/lingoose/embedder/openai" + "github.com/henomis/lingoose/index" indexoption "github.com/henomis/lingoose/index/option" - 
qdrantindex "github.com/henomis/lingoose/index/qdrant" + qdrantdb "github.com/henomis/lingoose/index/vectordb/qdrant" "github.com/henomis/lingoose/llm/openai" "github.com/henomis/lingoose/loader" "github.com/henomis/lingoose/prompt" @@ -18,34 +19,34 @@ import ( func main() { - openaiEmbedder := openaiembedder.New(openaiembedder.AdaEmbeddingV2) - - qdrantIndex := qdrantindex.New( - qdrantindex.Options{ - CollectionName: "test", - IncludeContent: true, - CreateCollection: &qdrantindex.CreateCollectionOptions{ - Dimension: 1536, - Distance: qdrantindex.DistanceCosine, + index := index.New( + qdrantdb.New( + qdrantdb.Options{ + CollectionName: "test", + IncludeContent: true, + CreateCollection: &qdrantdb.CreateCollectionOptions{ + Dimension: 1536, + Distance: qdrantdb.DistanceCosine, + }, }, - }, - openaiEmbedder, - ).WithAPIKeyAndEdpoint("", "http://localhost:6333") + ).WithAPIKeyAndEdpoint("", "http://localhost:6333"), + openaiembedder.New(openaiembedder.AdaEmbeddingV2), + ).WithIncludeContents(true) - indexIsEmpty, err := qdrantIndex.IsEmpty(context.Background()) + indexIsEmpty, err := index.IsEmpty(context.Background()) if err != nil { panic(err) } if indexIsEmpty { - err = ingestData(qdrantIndex) + err = ingestData(index) if err != nil { panic(err) } } query := "What is the purpose of the NATO Alliance?" 
- similarities, err := qdrantIndex.Query( + similarities, err := index.Query( context.Background(), query, indexoption.WithTopK(3), @@ -86,7 +87,7 @@ func main() { } -func ingestData(qdrantIndex *qdrantindex.Index) error { +func ingestData(qdrantIndex *index.Index) error { documents, err := loader.NewDirectoryLoader(".", ".txt").Load(context.Background()) if err != nil { diff --git a/examples/embeddings/simplekb/main.go b/examples/embeddings/simplekb/main.go index f87b7b32..28eeb135 100644 --- a/examples/embeddings/simplekb/main.go +++ b/examples/embeddings/simplekb/main.go @@ -4,8 +4,9 @@ import ( "context" openaiembedder "github.com/henomis/lingoose/embedder/openai" + "github.com/henomis/lingoose/index" "github.com/henomis/lingoose/index/option" - simplevectorindex "github.com/henomis/lingoose/index/simpleVectorIndex" + "github.com/henomis/lingoose/index/vectordb/jsondb" "github.com/henomis/lingoose/llm/openai" "github.com/henomis/lingoose/loader" qapipeline "github.com/henomis/lingoose/pipeline/qa" @@ -14,7 +15,7 @@ import ( func main() { docs, _ := loader.NewPDFToTextLoader("./kb").WithTextSplitter(textsplitter.NewRecursiveCharacterTextSplitter(2000, 200)).Load(context.Background()) - index := simplevectorindex.New("db", ".", openaiembedder.New(openaiembedder.AdaEmbeddingV2)) + index := index.New(jsondb.New("db.json"), openaiembedder.New(openaiembedder.AdaEmbeddingV2)).WithIncludeContents(true) index.LoadFromDocuments(context.Background(), docs) qapipeline.New(openai.NewChat().WithVerbose(true)).WithIndex(index).Query(context.Background(), "What is the NATO purpose?", option.WithTopK(1)) } diff --git a/examples/llm/cache/main.go b/examples/llm/cache/main.go index b6f6d475..33c2cf6c 100644 --- a/examples/llm/cache/main.go +++ b/examples/llm/cache/main.go @@ -8,7 +8,8 @@ import ( "strings" openaiembedder "github.com/henomis/lingoose/embedder/openai" - simplevectorindex "github.com/henomis/lingoose/index/simpleVectorIndex" + "github.com/henomis/lingoose/index" + 
"github.com/henomis/lingoose/index/vectordb/jsondb" "github.com/henomis/lingoose/llm/cache" "github.com/henomis/lingoose/llm/openai" ) @@ -16,7 +17,10 @@ import ( func main() { embedder := openaiembedder.New(openaiembedder.AdaEmbeddingV2) - index := simplevectorindex.New("db", ".", embedder) + index := index.New( + jsondb.New("db.json"), + embedder, + ) llm := openai.NewCompletion().WithCompletionCache(cache.New(embedder, index).WithTopK(3)) for { diff --git a/index/index.go b/index/index.go index 068e615c..e1e0b554 100644 --- a/index/index.go +++ b/index/index.go @@ -3,9 +3,12 @@ package index import ( "context" "errors" + "fmt" + "github.com/google/uuid" "github.com/henomis/lingoose/document" "github.com/henomis/lingoose/embedder" + "github.com/henomis/lingoose/index/option" "github.com/henomis/lingoose/types" ) @@ -14,8 +17,11 @@ var ( ) const ( - DefaultKeyID = "id" - DefaultKeyContent = "content" + DefaultKeyID = "id" + DefaultKeyContent = "content" + defaultBatchInsertSize = 32 + defaultTopK = 10 + defaultIncludeContent = true ) type Data struct { @@ -24,6 +30,144 @@ type Data struct { Metadata types.Meta } +type Embedder interface { + Embed(ctx context.Context, texts []string) ([]embedder.Embedding, error) +} + +type VectorDB interface { + Insert(context.Context, []Data) error + IsEmpty(context.Context) (bool, error) + Search(context.Context, []float64, *option.Options) (SearchResults, error) +} + +type Index struct { + vectorDB VectorDB + embedder Embedder + batchInsertSize int + includeContent bool +} + +func New(vectorDB VectorDB, embedder Embedder) *Index { + return &Index{ + vectorDB: vectorDB, + embedder: embedder, + batchInsertSize: defaultBatchInsertSize, + includeContent: defaultIncludeContent, + } +} + +func (i *Index) WithIncludeContents(includeContents bool) *Index { + i.includeContent = includeContents + return i +} + +func (i *Index) WithBatchInsertSize(batchInsertSize int) *Index { + i.batchInsertSize = batchInsertSize + return i +} + +func (i 
*Index) LoadFromDocuments(ctx context.Context, documents []document.Document) error { + err := i.batchUpsert(ctx, documents) + if err != nil { + return fmt.Errorf("%w: %w", ErrInternal, err) + } + return nil +} + +func (i *Index) Add(ctx context.Context, data *Data) error { + if data == nil { + return nil + } + return i.vectorDB.Insert(ctx, []Data{*data}) +} + +func (i *Index) IsEmpty(ctx context.Context) (bool, error) { + return i.vectorDB.IsEmpty(ctx) +} + +func (i *Index) Search(ctx context.Context, values []float64, opts ...option.Option) (SearchResults, error) { + options := &option.Options{ + TopK: defaultTopK, + } + + for _, opt := range opts { + opt(options) + } + return i.vectorDB.Search(ctx, values, options) +} + +func (i *Index) Query(ctx context.Context, query string, opts ...option.Option) (SearchResults, error) { + embeddings, err := i.embedder.Embed(ctx, []string{query}) + if err != nil { + return nil, err + } + return i.Search(ctx, embeddings[0], opts...) +} + +func (i *Index) batchUpsert(ctx context.Context, documents []document.Document) error { + for j := 0; j < len(documents); j += i.batchInsertSize { + batchEnd := j + i.batchInsertSize + if batchEnd > len(documents) { + batchEnd = len(documents) + } + + texts := []string{} + for _, document := range documents[j:batchEnd] { + texts = append(texts, document.Content) + } + + embeddings, err := i.embedder.Embed(ctx, texts) + if err != nil { + return err + } + + data, err := i.buildDataFromEmbeddingsAndDocuments(embeddings, documents, j) + if err != nil { + return err + } + + err = i.vectorDB.Insert(ctx, data) + if err != nil { + return err + } + } + + return nil +} + +func (i *Index) buildDataFromEmbeddingsAndDocuments( + embeddings []embedder.Embedding, + documents []document.Document, + startIndex int, +) ([]Data, error) { + var vectors []Data + + for j, embedding := range embeddings { + metadata := DeepCopyMetadata(documents[startIndex+j].Metadata) + + // inject document content into vector 
metadata + if i.includeContent { + metadata[DefaultKeyContent] = documents[startIndex+j].Content + } + + vectorID, err := uuid.NewUUID() + if err != nil { + return nil, err + } + + vectors = append(vectors, Data{ + ID: vectorID.String(), + Values: embedding, + Metadata: metadata, + }) + + // inject vector ID into document metadata + documents[startIndex+j].Metadata[DefaultKeyID] = vectorID.String() + } + + return vectors, nil +} + type SearchResult struct { Data Score float64 @@ -53,10 +197,6 @@ func (s SearchResults) ToDocuments() []document.Document { return documents } -type Embedder interface { - Embed(ctx context.Context, texts []string) ([]embedder.Embedding, error) -} - func DeepCopyMetadata(metadata types.Meta) types.Meta { metadataCopy := make(types.Meta) for k, v := range metadata { diff --git a/index/pinecone/pinecone.go b/index/pinecone/pinecone.go deleted file mode 100644 index 41405dd2..00000000 --- a/index/pinecone/pinecone.go +++ /dev/null @@ -1,431 +0,0 @@ -package pinecone - -import ( - "context" - "fmt" - "os" - "time" - - "github.com/google/uuid" - "github.com/henomis/lingoose/document" - "github.com/henomis/lingoose/embedder" - "github.com/henomis/lingoose/index" - "github.com/henomis/lingoose/index/option" - pineconego "github.com/henomis/pinecone-go" - pineconerequest "github.com/henomis/pinecone-go/request" - pineconeresponse "github.com/henomis/pinecone-go/response" -) - -const ( - defaultTopK = 10 - defaultBatchUpsertSize = 32 -) - -type Index struct { - pineconeClient *pineconego.PineconeGo - indexName string - projectID *string - namespace string - embedder index.Embedder - includeContent bool - includeValues bool - batchUpsertSize int - - createIndex *CreateIndexOptions -} - -type CreateIndexOptions struct { - Dimension int - Replicas int - Metric string - PodType string -} - -type Options struct { - IndexName string - Namespace string - IncludeContent bool - IncludeValues bool - BatchUpsertSize *int - CreateIndex *CreateIndexOptions -} 
- -func New(options Options, embedder index.Embedder) *Index { - apiKey := os.Getenv("PINECONE_API_KEY") - environment := os.Getenv("PINECONE_ENVIRONMENT") - - pineconeClient := pineconego.New(environment, apiKey) - - batchUpsertSize := defaultBatchUpsertSize - if options.BatchUpsertSize != nil { - batchUpsertSize = *options.BatchUpsertSize - } - - return &Index{ - pineconeClient: pineconeClient, - indexName: options.IndexName, - embedder: embedder, - namespace: options.Namespace, - includeContent: options.IncludeContent, - includeValues: options.IncludeValues, - batchUpsertSize: batchUpsertSize, - createIndex: options.CreateIndex, - } -} - -func (p *Index) WithAPIKeyAndEnvironment(apiKey, environment string) *Index { - p.pineconeClient = pineconego.New(environment, apiKey) - return p -} - -func (p *Index) LoadFromDocuments(ctx context.Context, documents []document.Document) error { - err := p.createIndexIfRequired(ctx) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - err = p.batchUpsert(ctx, documents) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - return nil -} - -func (p *Index) IsEmpty(ctx context.Context) (bool, error) { - err := p.createIndexIfRequired(ctx) - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - err = p.getProjectID(ctx) - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - req := &pineconerequest.VectorDescribeIndexStats{ - IndexName: p.indexName, - ProjectID: *p.projectID, - } - res := &pineconeresponse.VectorDescribeIndexStats{} - - err = p.pineconeClient.VectorDescribeIndexStats(ctx, req, res) - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - namespace, ok := res.Namespaces[p.namespace] - if !ok { - return true, nil - } - - if namespace.VectorCount == nil { - return false, fmt.Errorf("%w: failed to get total index size", index.ErrInternal) - } - - return *namespace.VectorCount == 
0, nil -} - -func (p *Index) Add(ctx context.Context, item *index.Data) error { - err := p.createIndexIfRequired(ctx) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - if item.ID == "" { - id, errUUID := uuid.NewUUID() - if errUUID != nil { - return errUUID - } - item.ID = id.String() - } - - return p.vectorUpsert(ctx, - []pineconerequest.Vector{ - { - ID: item.ID, - Values: item.Values, - Metadata: item.Metadata, - }, - }, - ) -} - -func (p *Index) Search(ctx context.Context, values []float64, opts ...option.Option) (index.SearchResults, error) { - pineconeOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(pineconeOptions) - } - - if pineconeOptions.Filter == nil { - pineconeOptions.Filter = map[string]string{} - } - - matches, err := p.similaritySearch(ctx, values, pineconeOptions) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return buildSearchResultsFromPineconeMatches(matches, p.includeContent), nil -} - -func (p *Index) Query(ctx context.Context, query string, opts ...option.Option) (index.SearchResults, error) { - pineconeOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(pineconeOptions) - } - - if pineconeOptions.Filter == nil { - pineconeOptions.Filter = map[string]string{} - } - - matches, err := p.query(ctx, query, pineconeOptions) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return buildSearchResultsFromPineconeMatches(matches, p.includeContent), nil -} - -func (p *Index) query(ctx context.Context, query string, opts *option.Options) ([]pineconeresponse.QueryMatch, error) { - embeddings, err := p.embedder.Embed(ctx, []string{query}) - if err != nil { - return nil, err - } - - return p.similaritySearch(ctx, embeddings[0], opts) -} - -func (p *Index) similaritySearch( - ctx context.Context, - values []float64, - opts *option.Options, -) ([]pineconeresponse.QueryMatch, 
error) { - err := p.getProjectID(ctx) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - includeMetadata := true - res := &pineconeresponse.VectorQuery{} - err = p.pineconeClient.VectorQuery( - ctx, - &pineconerequest.VectorQuery{ - IndexName: p.indexName, - ProjectID: *p.projectID, - TopK: int32(opts.TopK), - Vector: values, - IncludeMetadata: &includeMetadata, - IncludeValues: &p.includeValues, - Namespace: &p.namespace, - Filter: opts.Filter.(map[string]string), - }, - res, - ) - if err != nil { - return nil, err - } - - return res.Matches, nil -} - -func (p *Index) getProjectID(ctx context.Context) error { - if p.projectID != nil { - return nil - } - - whoamiResp := &pineconeresponse.Whoami{} - - err := p.pineconeClient.Whoami(ctx, &pineconerequest.Whoami{}, whoamiResp) - if err != nil { - return err - } - - p.projectID = &whoamiResp.ProjectID - - return nil -} - -func (p *Index) createIndexIfRequired(ctx context.Context) error { - if p.createIndex == nil { - return nil - } - - resp := &pineconeresponse.IndexList{} - err := p.pineconeClient.IndexList(ctx, &pineconerequest.IndexList{}, resp) - if err != nil { - return err - } - - for _, index := range resp.Indexes { - if index == p.indexName { - return nil - } - } - - metric := pineconerequest.Metric(p.createIndex.Metric) - - req := &pineconerequest.IndexCreate{ - Name: p.indexName, - Dimension: p.createIndex.Dimension, - Replicas: &p.createIndex.Replicas, - Metric: &metric, - PodType: &p.createIndex.PodType, - } - - err = p.pineconeClient.IndexCreate(ctx, req, &pineconeresponse.IndexCreate{}) - if err != nil { - return err - } - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - - describe := &pineconeresponse.IndexDescribe{} - err = p.pineconeClient.IndexDescribe(ctx, &pineconerequest.IndexDescribe{IndexName: p.indexName}, describe) - if err != nil { - return err - } - - if describe.Status.Ready { - return nil - } - - time.Sleep(1 * time.Second) - } - } -} 
- -func (p *Index) batchUpsert(ctx context.Context, documents []document.Document) error { - for i := 0; i < len(documents); i += p.batchUpsertSize { - batchEnd := i + p.batchUpsertSize - if batchEnd > len(documents) { - batchEnd = len(documents) - } - - texts := []string{} - for _, document := range documents[i:batchEnd] { - texts = append(texts, document.Content) - } - - embeddings, err := p.embedder.Embed(ctx, texts) - if err != nil { - return err - } - - vectors, err := buildPineconeVectorsFromEmbeddingsAndDocuments(embeddings, documents, i, p.includeContent) - if err != nil { - return err - } - - err = p.vectorUpsert(ctx, vectors) - if err != nil { - return err - } - } - - return nil -} - -func (p *Index) vectorUpsert(ctx context.Context, vectors []pineconerequest.Vector) error { - err := p.getProjectID(ctx) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - req := &pineconerequest.VectorUpsert{ - IndexName: p.indexName, - ProjectID: *p.projectID, - Vectors: vectors, - Namespace: p.namespace, - } - res := &pineconeresponse.VectorUpsert{} - - err = p.pineconeClient.VectorUpsert(ctx, req, res) - if err != nil { - return err - } - - if res.UpsertedCount == nil || res.UpsertedCount != nil && *res.UpsertedCount != int64(len(vectors)) { - return fmt.Errorf("error upserting embeddings") - } - - return nil -} - -func buildPineconeVectorsFromEmbeddingsAndDocuments( - embeddings []embedder.Embedding, - documents []document.Document, - startIndex int, - includeContent bool, -) ([]pineconerequest.Vector, error) { - var vectors []pineconerequest.Vector - - for i, embedding := range embeddings { - metadata := index.DeepCopyMetadata(documents[startIndex+i].Metadata) - - // inject document content into vector metadata - if includeContent { - metadata[index.DefaultKeyContent] = documents[startIndex+i].Content - } - - vectorID, err := uuid.NewUUID() - if err != nil { - return nil, err - } - - vectors = append(vectors, pineconerequest.Vector{ - ID: 
vectorID.String(), - Values: embedding, - Metadata: metadata, - }) - - // inject vector ID into document metadata - documents[startIndex+i].Metadata[index.DefaultKeyID] = vectorID.String() - } - - return vectors, nil -} - -func buildSearchResultsFromPineconeMatches( - matches []pineconeresponse.QueryMatch, - includeContent bool, -) index.SearchResults { - searchResults := make([]index.SearchResult, len(matches)) - - for i, match := range matches { - metadata := index.DeepCopyMetadata(match.Metadata) - if !includeContent { - delete(metadata, index.DefaultKeyContent) - } - - id := "" - if match.ID != nil { - id = *match.ID - } - - score := float64(0) - if match.Score != nil { - score = *match.Score - } - - searchResults[i] = index.SearchResult{ - Data: index.Data{ - ID: id, - Metadata: metadata, - Values: match.Values, - }, - Score: score, - } - } - - return searchResults -} diff --git a/index/qdrant/qdrant.go b/index/qdrant/qdrant.go deleted file mode 100644 index 81ccf6fe..00000000 --- a/index/qdrant/qdrant.go +++ /dev/null @@ -1,358 +0,0 @@ -package qdrant - -import ( - "context" - "fmt" - "os" - - "github.com/google/uuid" - "github.com/henomis/lingoose/document" - "github.com/henomis/lingoose/embedder" - "github.com/henomis/lingoose/index" - "github.com/henomis/lingoose/index/option" - qdrantgo "github.com/henomis/qdrant-go" - qdrantrequest "github.com/henomis/qdrant-go/request" - qdrantresponse "github.com/henomis/qdrant-go/response" -) - -const ( - defaultTopK = 10 - defaultBatchUpsertSize = 32 -) - -type Index struct { - qdrantClient *qdrantgo.Client - collectionName string - embedder index.Embedder - includeContent bool - includeValues bool - batchUpsertSize int - - createCollection *CreateCollectionOptions -} - -type Distance string - -const ( - DistanceCosine Distance = Distance(qdrantrequest.DistanceCosine) - DistanceEuclidean Distance = Distance(qdrantrequest.DistanceEuclidean) - DistanceDot Distance = Distance(qdrantrequest.DistanceDot) -) - -type 
CreateCollectionOptions struct { - Dimension uint64 - Distance Distance - OnDisk bool -} - -type Options struct { - CollectionName string - IncludeContent bool - IncludeValues bool - BatchUpsertSize *int - CreateCollection *CreateCollectionOptions -} - -func New(options Options, embedder index.Embedder) *Index { - apiKey := os.Getenv("QDRANT_API_KEY") - endpoint := os.Getenv("QDRANT_ENDPOINT") - - qdrantClient := qdrantgo.New(endpoint, apiKey) - - batchUpsertSize := defaultBatchUpsertSize - if options.BatchUpsertSize != nil { - batchUpsertSize = *options.BatchUpsertSize - } - - return &Index{ - qdrantClient: qdrantClient, - collectionName: options.CollectionName, - embedder: embedder, - includeContent: options.IncludeContent, - includeValues: options.IncludeValues, - batchUpsertSize: batchUpsertSize, - createCollection: options.CreateCollection, - } -} - -func (q *Index) WithAPIKeyAndEdpoint(apiKey, endpoint string) *Index { - q.qdrantClient = qdrantgo.New(endpoint, apiKey) - return q -} - -func (q *Index) LoadFromDocuments(ctx context.Context, documents []document.Document) error { - err := q.createCollectionIfRequired(ctx) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - err = q.batchUpsert(ctx, documents) - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - return nil -} - -func (q *Index) IsEmpty(ctx context.Context) (bool, error) { - err := q.createCollectionIfRequired(ctx) - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - res := &qdrantresponse.CollectionCollectInfo{} - err = q.qdrantClient.CollectionCollectInfo( - ctx, - &qdrantrequest.CollectionCollectInfo{ - CollectionName: q.collectionName, - }, - res, - ) - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return res.Result.VectorsCount == 0, nil -} - -func (q *Index) Add(ctx context.Context, item *index.Data) error { - err := q.createCollectionIfRequired(ctx) - if err != nil { - 
return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - if item.ID == "" { - id, errUUID := uuid.NewUUID() - if errUUID != nil { - return errUUID - } - item.ID = id.String() - } - - return q.pointUpsert(ctx, - []qdrantrequest.Point{ - { - ID: item.ID, - Vector: item.Values, - Payload: item.Metadata, - }, - }, - ) -} - -func (q *Index) Search(ctx context.Context, values []float64, opts ...option.Option) (index.SearchResults, error) { - qdrantOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(qdrantOptions) - } - - matches, err := q.similaritySearch(ctx, values, qdrantOptions) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return buildSearchResultsFromQdrantMatches(matches, q.includeContent), nil -} - -func (q *Index) Query(ctx context.Context, query string, opts ...option.Option) (index.SearchResults, error) { - qdrantOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(qdrantOptions) - } - - matches, err := q.query(ctx, query, qdrantOptions) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return buildSearchResultsFromQdrantMatches(matches, q.includeContent), nil -} - -func (q *Index) similaritySearch( - ctx context.Context, - values []float64, - opts *option.Options, -) ([]qdrantresponse.PointSearchResult, error) { - if opts.Filter == nil { - opts.Filter = qdrantrequest.Filter{} - } - - includeMetadata := true - res := &qdrantresponse.PointSearch{} - err := q.qdrantClient.PointSearch( - ctx, - &qdrantrequest.PointSearch{ - CollectionName: q.collectionName, - Limit: opts.TopK, - Vector: values, - WithPayload: &includeMetadata, - WithVector: &q.includeValues, - Filter: opts.Filter.(qdrantrequest.Filter), - }, - res, - ) - if err != nil { - return nil, err - } - - return res.Result, nil -} - -func (q *Index) query( - ctx context.Context, - query string, - opts *option.Options, -) ([]qdrantresponse.PointSearchResult, 
error) { - embeddings, err := q.embedder.Embed(ctx, []string{query}) - if err != nil { - return nil, err - } - - return q.similaritySearch(ctx, embeddings[0], opts) -} - -func (q *Index) createCollectionIfRequired(ctx context.Context) error { - if q.createCollection == nil { - return nil - } - - resp := &qdrantresponse.CollectionList{} - err := q.qdrantClient.CollectionList(ctx, &qdrantrequest.CollectionList{}, resp) - if err != nil { - return err - } - - for _, collection := range resp.Result.Collections { - if collection.Name == q.collectionName { - return nil - } - } - - req := &qdrantrequest.CollectionCreate{ - CollectionName: q.collectionName, - Vectors: qdrantrequest.VectorsParams{ - Size: q.createCollection.Dimension, - Distance: qdrantrequest.Distance(q.createCollection.Distance), - OnDisk: &q.createCollection.OnDisk, - }, - } - - err = q.qdrantClient.CollectionCreate(ctx, req, &qdrantresponse.CollectionCreate{}) - if err != nil { - return err - } - - return nil -} - -func (q *Index) batchUpsert(ctx context.Context, documents []document.Document) error { - for i := 0; i < len(documents); i += q.batchUpsertSize { - batchEnd := i + q.batchUpsertSize - if batchEnd > len(documents) { - batchEnd = len(documents) - } - - texts := []string{} - for _, document := range documents[i:batchEnd] { - texts = append(texts, document.Content) - } - - embeddings, err := q.embedder.Embed(ctx, texts) - if err != nil { - return err - } - - points, err := buildQdrantPointsFromEmbeddingsAndDocuments(embeddings, documents, i, q.includeContent) - if err != nil { - return err - } - - err = q.pointUpsert(ctx, points) - if err != nil { - return err - } - } - - return nil -} - -func (q *Index) pointUpsert(ctx context.Context, points []qdrantrequest.Point) error { - wait := true - req := &qdrantrequest.PointUpsert{ - Wait: &wait, - CollectionName: q.collectionName, - Points: points, - } - res := &qdrantresponse.PointUpsert{} - - err := q.qdrantClient.PointUpsert(ctx, req, res) - if err 
!= nil { - return err - } - - return nil -} - -func buildQdrantPointsFromEmbeddingsAndDocuments( - embeddings []embedder.Embedding, - documents []document.Document, - startIndex int, - includeContent bool, -) ([]qdrantrequest.Point, error) { - var vectors []qdrantrequest.Point - - for i, embedding := range embeddings { - metadata := index.DeepCopyMetadata(documents[startIndex+i].Metadata) - - // inject document content into vector metadata - if includeContent { - metadata[index.DefaultKeyContent] = documents[startIndex+i].Content - } - - vectorID, err := uuid.NewUUID() - if err != nil { - return nil, err - } - - vectors = append(vectors, qdrantrequest.Point{ - ID: vectorID.String(), - Vector: embedding, - Payload: metadata, - }) - - // inject vector ID into document metadata - documents[startIndex+i].Metadata[index.DefaultKeyID] = vectorID.String() - } - - return vectors, nil -} - -func buildSearchResultsFromQdrantMatches( - matches []qdrantresponse.PointSearchResult, - includeContent bool, -) index.SearchResults { - searchResults := make([]index.SearchResult, len(matches)) - - for i, match := range matches { - metadata := index.DeepCopyMetadata(match.Payload) - if !includeContent { - delete(metadata, index.DefaultKeyContent) - } - - searchResults[i] = index.SearchResult{ - Data: index.Data{ - ID: match.ID, - Metadata: metadata, - Values: match.Vector, - }, - Score: match.Score, - } - } - - return searchResults -} diff --git a/index/simpleVectorIndex/simpleVectorIndex.go b/index/simpleVectorIndex/simpleVectorIndex.go deleted file mode 100644 index 203104b9..00000000 --- a/index/simpleVectorIndex/simpleVectorIndex.go +++ /dev/null @@ -1,298 +0,0 @@ -package simplevectorindex - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "math" - "os" - "sort" - "strings" - - "github.com/google/uuid" - "github.com/henomis/lingoose/document" - "github.com/henomis/lingoose/embedder" - "github.com/henomis/lingoose/index" - "github.com/henomis/lingoose/index/option" - 
"github.com/henomis/lingoose/types" -) - -const ( - defaultBatchSize = 32 - defaultTopK = 10 -) - -type data struct { - ID string `json:"id"` - Metadata types.Meta `json:"metadata"` - Values []float64 `json:"values"` -} - -type Index struct { - data []data - outputPath string - name string - embedder index.Embedder -} - -type FilterFn func([]index.SearchResult) []index.SearchResult - -func New(name string, outputPath string, embedder index.Embedder) *Index { - simpleVectorIndex := &Index{ - data: []data{}, - outputPath: outputPath, - name: name, - embedder: embedder, - } - - return simpleVectorIndex -} - -func (s *Index) LoadFromDocuments(ctx context.Context, documents []document.Document) error { - err := s.load() - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - for i := 0; i < len(documents); i += defaultBatchSize { - end := i + defaultBatchSize - if end > len(documents) { - end = len(documents) - } - - texts := []string{} - for _, document := range documents[i:end] { - texts = append(texts, document.Content) - } - - embeddings, errEmbed := s.embedder.Embed(ctx, texts) - if errEmbed != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, errEmbed) - } - - for j, document := range documents[i:end] { - id, errUUID := uuid.NewUUID() - if errUUID != nil { - return errUUID - } - s.data = append(s.data, buildDataFromEmbeddingAndDocument(id.String(), embeddings[j], document)) - } - } - - err = s.save() - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return nil -} - -func buildDataFromEmbeddingAndDocument( - id string, - embedding embedder.Embedding, - document document.Document, -) data { - metadata := index.DeepCopyMetadata(document.Metadata) - metadata[index.DefaultKeyContent] = document.Content - return data{ - ID: id, - Values: embedding, - Metadata: metadata, - } -} - -func (s Index) save() error { - jsonContent, err := json.Marshal(s.data) - if err != nil { - return err - } - - return 
os.WriteFile(s.database(), jsonContent, 0600) -} - -func (s *Index) load() error { - if len(s.data) > 0 { - return nil - } - - if _, err := os.Stat(s.database()); os.IsNotExist(err) { - return s.save() - } - - content, err := os.ReadFile(s.database()) - if err != nil { - return err - } - - return json.Unmarshal(content, &s.data) -} - -func (s *Index) database() string { - return strings.Join([]string{s.outputPath, s.name + ".json"}, string(os.PathSeparator)) -} - -func (s *Index) IsEmpty() (bool, error) { - err := s.load() - if err != nil { - return true, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return len(s.data) == 0, nil -} - -func (s *Index) Add(ctx context.Context, item *index.Data) error { - _ = ctx - err := s.load() - if err != nil { - return fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - if item.ID == "" { - id, errUUID := uuid.NewUUID() - if errUUID != nil { - return errUUID - } - item.ID = id.String() - } - - s.data = append( - s.data, - data{ - ID: item.ID, - Values: item.Values, - Metadata: item.Metadata, - }, - ) - - return s.save() -} - -func (s *Index) Search(ctx context.Context, values []float64, opts ...option.Option) (index.SearchResults, error) { - sviOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(sviOptions) - } - - err := s.load() - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return s.similaritySearch(ctx, values, sviOptions) -} - -func (s *Index) Query(ctx context.Context, query string, opts ...option.Option) (index.SearchResults, error) { - sviOptions := &option.Options{ - TopK: defaultTopK, - } - - for _, opt := range opts { - opt(sviOptions) - } - - err := s.load() - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - embeddings, err := s.embedder.Embed(ctx, []string{query}) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - return s.similaritySearch(ctx, embeddings[0], 
sviOptions) -} - -func (s *Index) similaritySearch( - ctx context.Context, - embedding embedder.Embedding, - opts *option.Options, -) (index.SearchResults, error) { - _ = ctx - scores, err := s.cosineSimilarityBatch(embedding) - if err != nil { - return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) - } - - searchResults := make([]index.SearchResult, len(scores)) - - for i, score := range scores { - searchResults[i] = index.SearchResult{ - Data: index.Data{ - ID: s.data[i].ID, - Values: s.data[i].Values, - Metadata: s.data[i].Metadata, - }, - Score: score, - } - } - - if opts.Filter != nil { - searchResults = opts.Filter.(FilterFn)(searchResults) - } - - return filterSearchResults(searchResults, opts.TopK), nil -} - -func (s *Index) cosineSimilarity(a []float64, b []float64) (cosine float64, err error) { - var count int - lengthA := len(a) - lengthB := len(b) - if lengthA > lengthB { - count = lengthA - } else { - count = lengthB - } - sumA := 0.0 - s1 := 0.0 - s2 := 0.0 - for k := 0; k < count; k++ { - if k >= lengthA { - s2 += math.Pow(b[k], 2) - continue - } - if k >= lengthB { - s1 += math.Pow(a[k], 2) - continue - } - sumA += a[k] * b[k] - s1 += math.Pow(a[k], 2) - s2 += math.Pow(b[k], 2) - } - if s1 == 0 || s2 == 0 { - return 0.0, errors.New("vectors should not be null (all zeros)") - } - return sumA / (math.Sqrt(s1) * math.Sqrt(s2)), nil -} - -func (s *Index) cosineSimilarityBatch(a embedder.Embedding) ([]float64, error) { - var err error - scores := make([]float64, len(s.data)) - - for i := range s.data { - scores[i], err = s.cosineSimilarity(a, s.data[i].Values) - if err != nil { - return nil, err - } - } - - return scores, nil -} - -func filterSearchResults(searchResults index.SearchResults, topK int) index.SearchResults { - //sort by similarity score - sort.Slice(searchResults, func(i, j int) bool { - return (1 - searchResults[i].Score) < (1 - searchResults[j].Score) - }) - - maxTopK := topK - if maxTopK > len(searchResults) { - maxTopK = 
len(searchResults) - } - - return searchResults[:maxTopK] -} diff --git a/index/vectordb/jsondb/jsondb.go b/index/vectordb/jsondb/jsondb.go new file mode 100644 index 00000000..9a10466d --- /dev/null +++ b/index/vectordb/jsondb/jsondb.go @@ -0,0 +1,203 @@ +package jsondb + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "math" + "os" + "sort" + + "github.com/google/uuid" + "github.com/henomis/lingoose/embedder" + "github.com/henomis/lingoose/index" + "github.com/henomis/lingoose/index/option" + "github.com/henomis/lingoose/types" +) + +type data struct { + ID string `json:"id"` + Metadata types.Meta `json:"metadata"` + Values []float64 `json:"values"` +} + +type Index struct { + data []data + dbPath string +} + +type FilterFn func([]index.SearchResult) []index.SearchResult + +func New(dbPath string) *Index { + index := &Index{ + data: []data{}, + dbPath: dbPath, + } + + return index +} + +func (i Index) save() error { + jsonContent, err := json.Marshal(i.data) + if err != nil { + return err + } + + return os.WriteFile(i.dbPath, jsonContent, 0600) +} + +func (i *Index) load() error { + if len(i.data) > 0 { + return nil + } + + if _, err := os.Stat(i.dbPath); os.IsNotExist(err) { + return i.save() + } + + content, err := os.ReadFile(i.dbPath) + if err != nil { + return err + } + + return json.Unmarshal(content, &i.data) +} + +func (i *Index) IsEmpty(_ context.Context) (bool, error) { + err := i.load() + if err != nil { + return true, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + return len(i.data) == 0, nil +} + +func (i *Index) Insert(ctx context.Context, datas []index.Data) error { + _ = ctx + err := i.load() + if err != nil { + return fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + var records []data + for _, item := range datas { + if item.ID == "" { + id, errUUID := uuid.NewUUID() + if errUUID != nil { + return errUUID + } + item.ID = id.String() + } + + point := data{ + ID: item.ID, + Values: item.Values, + Metadata: item.Metadata, + } 
+ records = append(records, point) + } + + i.data = append(i.data, records...) + + return i.save() +} + +func (i *Index) Search(ctx context.Context, values []float64, options *option.Options) (index.SearchResults, error) { + err := i.load() + if err != nil { + return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + return i.similaritySearch(ctx, values, options) +} + +func (i *Index) similaritySearch( + ctx context.Context, + embedding embedder.Embedding, + opts *option.Options, +) (index.SearchResults, error) { + _ = ctx + scores, err := i.cosineSimilarityBatch(embedding) + if err != nil { + return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + searchResults := make([]index.SearchResult, len(scores)) + + for j, score := range scores { + searchResults[j] = index.SearchResult{ + Data: index.Data{ + ID: i.data[j].ID, + Values: i.data[j].Values, + Metadata: i.data[j].Metadata, + }, + Score: score, + } + } + + if opts.Filter != nil { + searchResults = opts.Filter.(FilterFn)(searchResults) + } + + return filterSearchResults(searchResults, opts.TopK), nil +} + +func (i *Index) cosineSimilarity(a []float64, b []float64) (cosine float64, err error) { + var count int + lengthA := len(a) + lengthB := len(b) + if lengthA > lengthB { + count = lengthA + } else { + count = lengthB + } + sumA := 0.0 + s1 := 0.0 + s2 := 0.0 + for k := 0; k < count; k++ { + if k >= lengthA { + s2 += math.Pow(b[k], 2) + continue + } + if k >= lengthB { + s1 += math.Pow(a[k], 2) + continue + } + sumA += a[k] * b[k] + s1 += math.Pow(a[k], 2) + s2 += math.Pow(b[k], 2) + } + if s1 == 0 || s2 == 0 { + return 0.0, errors.New("vectors should not be null (all zeros)") + } + return sumA / (math.Sqrt(s1) * math.Sqrt(s2)), nil +} + +func (i *Index) cosineSimilarityBatch(a embedder.Embedding) ([]float64, error) { + var err error + scores := make([]float64, len(i.data)) + + for j := range i.data { + scores[j], err = i.cosineSimilarity(a, i.data[j].Values) + if err != nil { + return nil, err 
+ } + } + + return scores, nil +} + +func filterSearchResults(searchResults index.SearchResults, topK int) index.SearchResults { + //sort by similarity score + sort.Slice(searchResults, func(i, j int) bool { + return (1 - searchResults[i].Score) < (1 - searchResults[j].Score) + }) + + maxTopK := topK + if maxTopK > len(searchResults) { + maxTopK = len(searchResults) + } + + return searchResults[:maxTopK] +} diff --git a/index/vectordb/pinecone/pinecone.go b/index/vectordb/pinecone/pinecone.go new file mode 100644 index 00000000..06ddbd79 --- /dev/null +++ b/index/vectordb/pinecone/pinecone.go @@ -0,0 +1,282 @@ +package pinecone + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/google/uuid" + "github.com/henomis/lingoose/index" + "github.com/henomis/lingoose/index/option" + pineconego "github.com/henomis/pinecone-go" + pineconegorequest "github.com/henomis/pinecone-go/request" + pineconegoresponse "github.com/henomis/pinecone-go/response" +) + +type DB struct { + pineconeClient *pineconego.PineconeGo + indexName string + projectID *string + namespace string + + createIndexOptions *CreateIndexOptions +} + +type CreateIndexOptions struct { + Dimension int + Replicas int + Metric string + PodType string +} + +type Options struct { + IndexName string + Namespace string + CreateIndexOptions *CreateIndexOptions +} + +func New(options Options) *DB { + apiKey := os.Getenv("PINECONE_API_KEY") + environment := os.Getenv("PINECONE_ENVIRONMENT") + + pineconeClient := pineconego.New(environment, apiKey) + + return &DB{ + pineconeClient: pineconeClient, + indexName: options.IndexName, + namespace: options.Namespace, + createIndexOptions: options.CreateIndexOptions, + } +} + +func (d *DB) WithAPIKeyAndEnvironment(apiKey, environment string) *DB { + d.pineconeClient = pineconego.New(environment, apiKey) + return d +} + +func (d *DB) IsEmpty(ctx context.Context) (bool, error) { + err := d.createIndexIfRequired(ctx) + if err != nil { + return true, fmt.Errorf("%w: %w", 
index.ErrInternal, err) + } + + err = d.getProjectID(ctx) + if err != nil { + return true, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + req := &pineconegorequest.VectorDescribeIndexStats{ + IndexName: d.indexName, + ProjectID: *d.projectID, + } + res := &pineconegoresponse.VectorDescribeIndexStats{} + + err = d.pineconeClient.VectorDescribeIndexStats(ctx, req, res) + if err != nil { + return true, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + namespace, ok := res.Namespaces[d.namespace] + if !ok { + return true, nil + } + + if namespace.VectorCount == nil { + return false, fmt.Errorf("%w: failed to get total index size", index.ErrInternal) + } + + return *namespace.VectorCount == 0, nil +} + +func (d *DB) Search(ctx context.Context, values []float64, options *option.Options) (index.SearchResults, error) { + if options.Filter == nil { + options.Filter = map[string]string{} + } + + matches, err := d.similaritySearch(ctx, values, options) + if err != nil { + return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + return buildSearchResultsFromPineconeMatches(matches), nil +} + +func (d *DB) similaritySearch( + ctx context.Context, + values []float64, + opts *option.Options, +) ([]pineconegoresponse.QueryMatch, error) { + err := d.getProjectID(ctx) + if err != nil { + return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + includeMetadata := true + includeValues := true + res := &pineconegoresponse.VectorQuery{} + err = d.pineconeClient.VectorQuery( + ctx, + &pineconegorequest.VectorQuery{ + IndexName: d.indexName, + ProjectID: *d.projectID, + TopK: int32(opts.TopK), + Vector: values, + IncludeMetadata: &includeMetadata, + IncludeValues: &includeValues, + Namespace: &d.namespace, + Filter: opts.Filter.(map[string]string), + }, + res, + ) + if err != nil { + return nil, err + } + + return res.Matches, nil +} + +func (d *DB) getProjectID(ctx context.Context) error { + if d.projectID != nil { + return nil + } + + whoamiResp := 
&pineconegoresponse.Whoami{} + + err := d.pineconeClient.Whoami(ctx, &pineconegorequest.Whoami{}, whoamiResp) + if err != nil { + return err + } + + d.projectID = &whoamiResp.ProjectID + + return nil +} + +func (d *DB) createIndexIfRequired(ctx context.Context) error { + if d.createIndexOptions == nil { + return nil + } + + resp := &pineconegoresponse.IndexList{} + err := d.pineconeClient.IndexList(ctx, &pineconegorequest.IndexList{}, resp) + if err != nil { + return err + } + + for _, index := range resp.Indexes { + if index == d.indexName { + return nil + } + } + + metric := pineconegorequest.Metric(d.createIndexOptions.Metric) + + req := &pineconegorequest.IndexCreate{ + Name: d.indexName, + Dimension: d.createIndexOptions.Dimension, + Replicas: &d.createIndexOptions.Replicas, + Metric: &metric, + PodType: &d.createIndexOptions.PodType, + } + + err = d.pineconeClient.IndexCreate(ctx, req, &pineconegoresponse.IndexCreate{}) + if err != nil { + return err + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + + describe := &pineconegoresponse.IndexDescribe{} + err = d.pineconeClient.IndexDescribe(ctx, &pineconegorequest.IndexDescribe{IndexName: d.indexName}, describe) + if err != nil { + return err + } + + if describe.Status.Ready { + return nil + } + + time.Sleep(1 * time.Second) + } + } +} + +func (d *DB) Insert(ctx context.Context, datas []index.Data) error { + err := d.getProjectID(ctx) + if err != nil { + return fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + vectors := []pineconegorequest.Vector{} + for _, data := range datas { + if data.ID == "" { + id, errUUID := uuid.NewUUID() + if errUUID != nil { + return errUUID + } + data.ID = id.String() + } + + vector := pineconegorequest.Vector{ + ID: data.ID, + Values: data.Values, + Metadata: data.Metadata, + } + vectors = append(vectors, vector) + } + + req := &pineconegorequest.VectorUpsert{ + IndexName: d.indexName, + ProjectID: *d.projectID, + Vectors: vectors, + Namespace: 
d.namespace,
+	}
+	res := &pineconegoresponse.VectorUpsert{}
+
+	err = d.pineconeClient.VectorUpsert(ctx, req, res)
+	if err != nil {
+		return err
+	}
+
+	if res.UpsertedCount == nil || *res.UpsertedCount != int64(len(vectors)) {
+		return fmt.Errorf("error upserting embeddings")
+	}
+
+	return nil
+}
+
+func buildSearchResultsFromPineconeMatches(
+	matches []pineconegoresponse.QueryMatch,
+) index.SearchResults {
+	searchResults := make([]index.SearchResult, len(matches))
+
+	for i, match := range matches {
+		metadata := index.DeepCopyMetadata(match.Metadata)
+
+		id := ""
+		if match.ID != nil {
+			id = *match.ID
+		}
+
+		score := float64(0)
+		if match.Score != nil {
+			score = *match.Score
+		}
+
+		searchResults[i] = index.SearchResult{
+			Data: index.Data{
+				ID:       id,
+				Metadata: metadata,
+				Values:   match.Values,
+			},
+			Score: score,
+		}
+	}
+
+	return searchResults
+}
diff --git a/index/vectordb/qdrant/qdrant.go b/index/vectordb/qdrant/qdrant.go
new file mode 100644
index 00000000..1882e843
--- /dev/null
+++ b/index/vectordb/qdrant/qdrant.go
@@ -0,0 +1,219 @@
+package qdrant
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/google/uuid"
+	"github.com/henomis/lingoose/index"
+	"github.com/henomis/lingoose/index/option"
+	qdrantgo "github.com/henomis/qdrant-go"
+	qdrantrequest "github.com/henomis/qdrant-go/request"
+	qdrantresponse "github.com/henomis/qdrant-go/response"
+)
+
+type DB struct {
+	qdrantClient   *qdrantgo.Client
+	collectionName string
+	includeContent bool
+	includeValues  bool
+
+	createCollection *CreateCollectionOptions
+}
+
+type Distance string
+
+const (
+	DistanceCosine    Distance = Distance(qdrantrequest.DistanceCosine)
+	DistanceEuclidean Distance = Distance(qdrantrequest.DistanceEuclidean)
+	DistanceDot       Distance = Distance(qdrantrequest.DistanceDot)
+)
+
+type CreateCollectionOptions struct {
+	Dimension uint64
+	Distance  Distance
+	OnDisk    bool
+}
+
+type Options struct {
+	CollectionName string
+	IncludeContent bool
+	
IncludeValues    bool
+	BatchUpsertSize  *int
+	CreateCollection *CreateCollectionOptions
+}
+
+func New(options Options) *DB {
+	apiKey := os.Getenv("QDRANT_API_KEY")
+	endpoint := os.Getenv("QDRANT_ENDPOINT")
+
+	qdrantClient := qdrantgo.New(endpoint, apiKey)
+
+	return &DB{
+		qdrantClient:     qdrantClient,
+		collectionName:   options.CollectionName,
+		includeContent:   options.IncludeContent,
+		includeValues:    options.IncludeValues,
+		createCollection: options.CreateCollection,
+	}
+}
+
+func (d *DB) WithAPIKeyAndEndpoint(apiKey, endpoint string) *DB {
+	d.qdrantClient = qdrantgo.New(endpoint, apiKey)
+	return d
+}
+
+func (d *DB) IsEmpty(ctx context.Context) (bool, error) {
+	err := d.createCollectionIfRequired(ctx)
+	if err != nil {
+		return true, fmt.Errorf("%w: %w", index.ErrInternal, err)
+	}
+
+	res := &qdrantresponse.CollectionCollectInfo{}
+	err = d.qdrantClient.CollectionCollectInfo(
+		ctx,
+		&qdrantrequest.CollectionCollectInfo{
+			CollectionName: d.collectionName,
+		},
+		res,
+	)
+	if err != nil {
+		return true, fmt.Errorf("%w: %w", index.ErrInternal, err)
+	}
+
+	return res.Result.VectorsCount == 0, nil
+}
+
+func (d *DB) Insert(ctx context.Context, datas []index.Data) error {
+	err := d.createCollectionIfRequired(ctx)
+	if err != nil {
+		return fmt.Errorf("%w: %w", index.ErrInternal, err)
+	}
+
+	var points []qdrantrequest.Point
+	for _, data := range datas {
+		if data.ID == "" {
+			id, errUUID := uuid.NewUUID()
+			if errUUID != nil {
+				return errUUID
+			}
+			data.ID = id.String()
+		}
+
+		point := qdrantrequest.Point{
+			ID:      data.ID,
+			Vector:  data.Values,
+			Payload: data.Metadata,
+		}
+		points = append(points, point)
+	}
+
+	wait := true
+	req := &qdrantrequest.PointUpsert{
+		Wait:           &wait,
+		CollectionName: d.collectionName,
+		Points:         points,
+	}
+	res := &qdrantresponse.PointUpsert{}
+
+	return d.qdrantClient.PointUpsert(ctx, req, res)
+}
+
+func (d *DB) Search(ctx context.Context, values []float64, options *option.Options) (index.SearchResults, error) {
+	matches, 
err := d.similaritySearch(ctx, values, options) + if err != nil { + return nil, fmt.Errorf("%w: %w", index.ErrInternal, err) + } + + return buildSearchResultsFromQdrantMatches(matches, d.includeContent), nil +} + +func (d *DB) similaritySearch( + ctx context.Context, + values []float64, + opts *option.Options, +) ([]qdrantresponse.PointSearchResult, error) { + if opts.Filter == nil { + opts.Filter = qdrantrequest.Filter{} + } + + includeMetadata := true + res := &qdrantresponse.PointSearch{} + err := d.qdrantClient.PointSearch( + ctx, + &qdrantrequest.PointSearch{ + CollectionName: d.collectionName, + Limit: opts.TopK, + Vector: values, + WithPayload: &includeMetadata, + WithVector: &d.includeValues, + Filter: opts.Filter.(qdrantrequest.Filter), + }, + res, + ) + if err != nil { + return nil, err + } + + return res.Result, nil +} + +func (d *DB) createCollectionIfRequired(ctx context.Context) error { + if d.createCollection == nil { + return nil + } + + resp := &qdrantresponse.CollectionList{} + err := d.qdrantClient.CollectionList(ctx, &qdrantrequest.CollectionList{}, resp) + if err != nil { + return err + } + + for _, collection := range resp.Result.Collections { + if collection.Name == d.collectionName { + return nil + } + } + + req := &qdrantrequest.CollectionCreate{ + CollectionName: d.collectionName, + Vectors: qdrantrequest.VectorsParams{ + Size: d.createCollection.Dimension, + Distance: qdrantrequest.Distance(d.createCollection.Distance), + OnDisk: &d.createCollection.OnDisk, + }, + } + + err = d.qdrantClient.CollectionCreate(ctx, req, &qdrantresponse.CollectionCreate{}) + if err != nil { + return err + } + + return nil +} + +func buildSearchResultsFromQdrantMatches( + matches []qdrantresponse.PointSearchResult, + includeContent bool, +) index.SearchResults { + searchResults := make([]index.SearchResult, len(matches)) + + for i, match := range matches { + metadata := index.DeepCopyMetadata(match.Payload) + if !includeContent { + delete(metadata, 
index.DefaultKeyContent) + } + + searchResults[i] = index.SearchResult{ + Data: index.Data{ + ID: match.ID, + Metadata: metadata, + Values: match.Vector, + }, + Score: match.Score, + } + } + + return searchResults +} diff --git a/llm/cache/cache.go b/llm/cache/cache.go index 6776ff34..bfc297a3 100644 --- a/llm/cache/cache.go +++ b/llm/cache/cache.go @@ -77,7 +77,7 @@ func (c *Cache) Get(ctx context.Context, query string) (*Result, error) { }, nil } - return nil, ErrCacheMiss + return &Result{Embedding: embedding[0]}, ErrCacheMiss } func (c *Cache) Set(ctx context.Context, embedding []float64, answer string) error { diff --git a/loader/pdf_to_text.go b/loader/pdf_to_text.go index b9cfc259..8a92f8a3 100644 --- a/loader/pdf_to_text.go +++ b/loader/pdf_to_text.go @@ -14,7 +14,7 @@ import ( var ( ErrPdfToTextNotFound = fmt.Errorf("pdftotext not found") - defaultPdfToTextPath = "/usr/bin/pdftotext" + defaultPdfToTextPath = "pdftotext" ) type PDFLoader struct { @@ -42,11 +42,6 @@ func (p *PDFLoader) WithTextSplitter(textSplitter TextSplitter) *PDFLoader { } func (p *PDFLoader) Load(ctx context.Context) ([]document.Document, error) { - _, err := os.Stat(p.pdfToTextPath) - if err != nil { - return nil, ErrPdfToTextNotFound - } - fileInfo, err := os.Stat(p.path) if err != nil { return nil, err