Skip to content

Commit

Permalink
Chore Add subdocument rag (#173)
Browse files Browse the repository at this point in the history
  • Loading branch information
henomis authored Feb 28, 2024
1 parent 02d056b commit 63d5e3a
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 5 deletions.
File renamed without changes.
54 changes: 54 additions & 0 deletions examples/rag/subdocument/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package main

import (
"context"
"fmt"
"os"

"github.com/henomis/lingoose/assistant"
openaiembedder "github.com/henomis/lingoose/embedder/openai"
"github.com/henomis/lingoose/index"
"github.com/henomis/lingoose/index/vectordb/jsondb"
"github.com/henomis/lingoose/llm/openai"
"github.com/henomis/lingoose/rag"
"github.com/henomis/lingoose/thread"
)

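// Before running this example, download the sample text into the working
// directory as state_of_the_union.txt:
// https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt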

// main demonstrates the sub-document RAG: it builds a SubDocumentRAG on top of
// an index persisted to db.json, ingests state_of_the_union.txt the first time
// it runs (i.e. when db.json does not exist yet), then asks an assistant wired
// to that RAG a question and prints the resulting thread.
func main() {
	const dbPath = "db.json"

	ctx := context.Background()

	r := rag.NewSubDocumentRAG(
		index.New(
			jsondb.New().WithPersist(dbPath),
			openaiembedder.New(openaiembedder.AdaEmbeddingV2),
		),
		openai.New(),
	).WithTopK(3)

	// Ingest the source only when the persisted index file is missing; later
	// runs reuse the existing file.
	_, err := os.Stat(dbPath)
	switch {
	case os.IsNotExist(err):
		if err = r.AddSources(ctx, "state_of_the_union.txt"); err != nil {
			panic(err)
		}
	case err != nil:
		// Any other Stat failure (e.g. permission denied) means we cannot tell
		// whether the index exists; fail loudly instead of querying blindly.
		panic(err)
	}

	a := assistant.New(
		openai.New().WithTemperature(0),
	).WithRAG(r).WithThread(
		thread.New().AddMessages(
			thread.NewUserMessage().AddContent(
				thread.NewTextContent("what is the purpose of NATO?"),
			),
		),
	)

	if err = a.Run(ctx); err != nil {
		panic(err)
	}

	fmt.Println("----")
	fmt.Println(a.Thread())
	fmt.Println("----")
}
5 changes: 0 additions & 5 deletions rag/rag.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,6 @@ type RAG struct {
loaders map[*regexp.Regexp]Loader // this map a regexp as string to a loader
}

// Fusion is a RAG variant: it embeds RAG and additionally holds an LLM.
// NOTE(review): presumably the LLM is used to generate the extra queries for
// RAG fusion — confirm against the retrieval code.
type Fusion struct {
	RAG
	llm LLM
}

func New(index *index.Index) *RAG {
rag := &RAG{
index: index,
Expand Down
5 changes: 5 additions & 0 deletions rag/rag_fusion.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ var ragFusionPrompts = []string{
"OUTPUT (4 queries):",
}

// Fusion is a RAG variant: it embeds RAG and additionally holds an LLM.
// NOTE(review): presumably the LLM is used with ragFusionPrompts to generate
// the extra queries for RAG fusion — confirm against the retrieval code.
type Fusion struct {
	RAG
	llm LLM
}

func NewFusion(index *index.Index, llm LLM) *Fusion {
return &Fusion{
RAG: *New(index),
Expand Down
121 changes: 121 additions & 0 deletions rag/sub_document.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package rag

import (
	"context"
	"errors"
	"regexp"

	"github.com/henomis/lingoose/document"
	"github.com/henomis/lingoose/index"
	"github.com/henomis/lingoose/textsplitter"
	"github.com/henomis/lingoose/thread"
	"github.com/henomis/lingoose/types"
)

// Defaults applied by NewSubDocumentRAG.
// NOTE(review): sizes are in whatever unit the text splitter counts
// (presumably characters) — confirm against the textsplitter package.
const (
	// defaultSubDocumentRAGChunkSize is the chunk size set on the embedded RAG.
	defaultSubDocumentRAGChunkSize = 8192
	// defaultSubDocumentRAGChunkOverlap is the chunk overlap set on the embedded RAG.
	defaultSubDocumentRAGChunkOverlap = 0
	// defaultSubDocumentRAGChildChunkSize is the initial childChunkSize, i.e. the
	// chunk size used when splitting each document into sub-documents.
	defaultSubDocumentRAGChildChunkSize = 512
)

// SubDocumentRAG is a RAG variant that indexes sub-documents: when sources are
// added, each loaded document is summarized by llm and split into smaller
// chunks, and every chunk is stored with that summary prepended.
type SubDocumentRAG struct {
	// RAG is the embedded base RAG (index, chunk settings, topK, loaders).
	RAG
	// childChunkSize is the chunk size used when splitting a document into
	// sub-documents.
	childChunkSize uint
	// llm generates the per-document summary.
	llm LLM
}

// SubDocumentRAGSummarizePrompt is the prompt template sent to the LLM to
// summarize each document; the {{.context}} placeholder is filled with the
// document content.
//
//nolint:lll
var SubDocumentRAGSummarizePrompt = "Please give a concise summary of the context in 1-2 sentences.\n\nContext: {{.context}}"

// NewSubDocumentRAG builds a SubDocumentRAG over the given index. The embedded
// RAG is created with New and configured with the sub-document default chunk
// size and overlap; the child chunk size starts at its default and llm is kept
// for generating document summaries.
func NewSubDocumentRAG(index *index.Index, llm LLM) *SubDocumentRAG {
	base := New(index).
		WithChunkSize(defaultSubDocumentRAGChunkSize).
		WithChunkOverlap(defaultSubDocumentRAGChunkOverlap)

	subRAG := &SubDocumentRAG{
		RAG:            *base, // copied by value into the embedded field
		childChunkSize: defaultSubDocumentRAGChildChunkSize,
		llm:            llm,
	}

	return subRAG
}

// WithChunkSize sets the chunk size on the embedded RAG and returns the
// receiver, so the call chain keeps the *SubDocumentRAG type.
func (r *SubDocumentRAG) WithChunkSize(chunkSize uint) *SubDocumentRAG {
	r.RAG.chunkSize = chunkSize

	return r
}

// WithChildChunkSize sets the chunk size used when splitting each document
// into sub-documents and returns the receiver for chaining.
func (r *SubDocumentRAG) WithChildChunkSize(childChunkSize uint) *SubDocumentRAG {
	rag := r
	rag.childChunkSize = childChunkSize

	return rag
}

// WithChunkOverlap sets the chunk overlap on the embedded RAG and returns the
// receiver, so the call chain keeps the *SubDocumentRAG type.
func (r *SubDocumentRAG) WithChunkOverlap(chunkOverlap uint) *SubDocumentRAG {
	r.RAG.chunkOverlap = chunkOverlap

	return r
}

// WithTopK sets topK on the embedded RAG and returns the receiver, so the call
// chain keeps the *SubDocumentRAG type.
func (r *SubDocumentRAG) WithTopK(topK uint) *SubDocumentRAG {
	r.RAG.topK = topK

	return r
}

// WithLoader registers loader under sourceRegexp in the embedded RAG's loader
// map (replacing any loader already stored under that key) and returns the
// receiver for chaining.
func (r *SubDocumentRAG) WithLoader(sourceRegexp *regexp.Regexp, loader Loader) *SubDocumentRAG {
	r.RAG.loaders[sourceRegexp] = loader

	return r
}

// AddSources processes the given sources one at a time: each source is loaded
// into documents via addSource, the documents are expanded into sub-documents,
// and those sub-documents are loaded into the index. Processing stops at the
// first error, which is returned unchanged; sources handled before the failure
// stay indexed.
func (r *SubDocumentRAG) AddSources(ctx context.Context, sources ...string) error {
	for idx := range sources {
		parents, err := r.addSource(ctx, sources[idx])
		if err != nil {
			return err
		}

		children, err := r.generateSubDocuments(ctx, parents)
		if err != nil {
			return err
		}

		if err = r.index.LoadFromDocuments(ctx, children); err != nil {
			return err
		}
	}

	return nil
}

// generateSubDocuments expands each document into sub-documents.
//
// For every input document it asks the LLM for a summary (using
// SubDocumentRAGSummarizePrompt with the document content as context), splits
// the document into chunks of childChunkSize with no overlap, and prefixes
// each chunk's content with the summary followed by a newline.
//
// It returns the sub-documents of all inputs in order. If the LLM call fails,
// or the LLM reply carries no content to use as a summary, it returns a nil
// slice and an error.
func (r *SubDocumentRAG) generateSubDocuments(
	ctx context.Context,
	documents []document.Document,
) ([]document.Document, error) {
	var subDocuments []document.Document

	for _, doc := range documents {
		t := thread.New().AddMessages(
			thread.NewUserMessage().AddContent(
				thread.NewTextContent(SubDocumentRAGSummarizePrompt).Format(
					types.M{
						"context": doc.Content,
					},
				),
			),
		)

		if err := r.llm.Generate(ctx, t); err != nil {
			return nil, err
		}

		// Guard the index below: a reply without any content part would
		// otherwise panic with an out-of-range access.
		contents := t.LastMessage().Contents
		if len(contents) == 0 {
			return nil, errors.New("sub-document rag: llm returned no content for the document summary")
		}
		summary := contents[0].AsString()

		subChunks := textsplitter.NewRecursiveCharacterTextSplitter(
			int(r.childChunkSize),
			0,
		).SplitDocuments([]document.Document{doc})

		// Prepend the parent summary so every chunk carries document-level
		// context when it is indexed.
		for i := range subChunks {
			subChunks[i].Content = summary + "\n" + subChunks[i].Content
		}

		subDocuments = append(subDocuments, subChunks...)
	}

	return subDocuments, nil
}

0 comments on commit 63d5e3a

Please sign in to comment.