From 35d79507b7d223a91cf210f5c046cb45bb3208f5 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Tue, 20 Aug 2024 17:34:18 +0200 Subject: [PATCH 1/2] add: blueprint flows file --- Makefile | 2 +- pkg/cmd/askdir.go | 2 +- pkg/cmd/client.go | 9 ++--- pkg/cmd/ingest.go | 8 +++-- pkg/cmd/retrieve.go | 5 +-- pkg/flows/config/blueprints.go | 21 ++++++++++++ pkg/flows/config/blueprints/default.yaml | 42 ++++++++++++++++++++++++ pkg/flows/config/config.go | 23 ++++++++++++- 8 files changed, 100 insertions(+), 12 deletions(-) create mode 100644 pkg/flows/config/blueprints.go create mode 100644 pkg/flows/config/blueprints/default.yaml diff --git a/Makefile b/Makefile index 410059e..0d7978f 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ ifeq ($(GIT_TAG),) GIT_TAG := $(shell git describe --always) endif -GO_TAGS := netgo +GO_TAGS := netgo mupdf LD_FLAGS := -s -w -X github.com/gptscript-ai/knowledge/version.Version=${GIT_TAG} build: go build -mod=mod -o bin/knowledge -tags "${GO_TAGS}" -ldflags '$(LD_FLAGS) ' . diff --git a/pkg/cmd/askdir.go b/pkg/cmd/askdir.go index 7dca8d4..c06bba3 100644 --- a/pkg/cmd/askdir.go +++ b/pkg/cmd/askdir.go @@ -66,7 +66,7 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error { datasetID := client.HashPath(abspath) slog.Debug("Loading ingestion flows from config", "flows_file", s.FlowsFile, "dataset", datasetID) - flowCfg, err := flowconfig.FromFile(s.FlowsFile) + flowCfg, err := flowconfig.Load(s.FlowsFile) if err != nil { return err } diff --git a/pkg/cmd/client.go b/pkg/cmd/client.go index 6bfc7f2..c2683b4 100644 --- a/pkg/cmd/client.go +++ b/pkg/cmd/client.go @@ -3,14 +3,15 @@ package cmd import ( "archive/zip" "fmt" + "io" + "os" + "path/filepath" + "github.com/gptscript-ai/knowledge/pkg/client" "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/datastore" "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings" "github.com/gptscript-ai/knowledge/pkg/datastore/types" - "io" - "os" - "path/filepath" ) type Client struct { @@ -25,7 +26,7 @@ type Client struct { } type ClientFlowsConfig struct { - FlowsFile string `usage:"Path to a YAML/JSON file containing ingestion/retrieval flows" env:"KNOW_FLOWS_FILE"` + FlowsFile string `usage:"Path to a YAML/JSON file containing ingestion/retrieval flows" env:"KNOW_FLOWS_FILE" default:"blueprint:default"` Flow string `usage:"Flow name" env:"KNOW_FLOW"` } diff --git a/pkg/cmd/ingest.go b/pkg/cmd/ingest.go index 0b6c4fe..a5a9b3f 100644 --- a/pkg/cmd/ingest.go +++ b/pkg/cmd/ingest.go @@ -2,11 +2,12 @@ package cmd import ( "fmt" - "github.com/acorn-io/z" - "github.com/spf13/cobra" "log/slog" "strings" + "github.com/acorn-io/z" + "github.com/spf13/cobra" + "github.com/gptscript-ai/knowledge/pkg/client" "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter" flowconfig "github.com/gptscript-ai/knowledge/pkg/flows/config" @@ -68,7 +69,8 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error { if s.FlowsFile != "" { slog.Debug("Loading ingestion flows from config", "flows_file", s.FlowsFile, "dataset", datasetID) - flowCfg, err := flowconfig.FromFile(s.FlowsFile) + + flowCfg, err := flowconfig.Load(s.FlowsFile) if err != nil { return err } diff --git a/pkg/cmd/retrieve.go b/pkg/cmd/retrieve.go index 0943667..238fb97 100644 --- a/pkg/cmd/retrieve.go +++ b/pkg/cmd/retrieve.go @@ -4,11 +4,12 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" + "github.com/gptscript-ai/knowledge/pkg/datastore" flowconfig "github.com/gptscript-ai/knowledge/pkg/flows/config" vserr "github.com/gptscript-ai/knowledge/pkg/vectorstore/errors" "github.com/spf13/cobra" - "log/slog" ) type ClientRetrieve struct { @@ -49,7 +50,7 @@ func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error { if s.FlowsFile != "" { slog.Debug("Loading retrieval flows from config", "flows_file", s.FlowsFile, "dataset", datasetIDs) - flowCfg, err := flowconfig.FromFile(s.FlowsFile) + flowCfg, err := flowconfig.Load(s.FlowsFile) if err != nil { return err } diff --git a/pkg/flows/config/blueprints.go b/pkg/flows/config/blueprints.go new file mode 100644 index 0000000..0c7462c --- /dev/null +++ b/pkg/flows/config/blueprints.go @@ -0,0 +1,21 @@ +package config + +import ( + _ "embed" + "fmt" +) + +//go:embed blueprints/default.yaml +var BlueprintDefault []byte + +var Blueprints = map[string][]byte{ + "default": BlueprintDefault, +} + +func GetBlueprint(name string) ([]byte, error) { + bp, ok := Blueprints[name] + if !ok { + return nil, fmt.Errorf("blueprint %q not found", name) + } + return bp, nil +} diff --git a/pkg/flows/config/blueprints/default.yaml b/pkg/flows/config/blueprints/default.yaml new file mode 100644 index 0000000..f94ad78 --- /dev/null +++ b/pkg/flows/config/blueprints/default.yaml @@ -0,0 +1,42 @@ +flows: + knowledge: + default: true + globals: + ingestion: + textsplitter: + chunkSize: 800 + chunkOverlap: 400 + ingestion: + - filetypes: ["*"] + retrieval: + querymodifiers: + # Enhance + - name: enhance + options: + model: + openai: + apiKey: "${OPENAI_API_KEY}" + model: gpt-4o + apiType: OPEN_AI + baseURL: https://api.openai.com/v1 + retriever: + name: subquery + options: + limit: 3 # max. 3 subqueries + topK: 10 # topK per search + model: + openai: + apiKey: "${OPENAI_API_KEY}" + model: gpt-4o + apiType: OPEN_AI + baseURL: https://api.openai.com/v1 + postprocessors: + - name: similarity + options: + threshold: 0.6 + - name: reduce + options: + topK: 10 + + + diff --git a/pkg/flows/config/config.go b/pkg/flows/config/config.go index 8fb3657..2d4fa03 100644 --- a/pkg/flows/config/config.go +++ b/pkg/flows/config/config.go @@ -3,11 +3,12 @@ package config import ( "encoding/json" "fmt" - "github.com/gptscript-ai/knowledge/pkg/output" "log/slog" "os" "strings" + "github.com/gptscript-ai/knowledge/pkg/output" + "github.com/gptscript-ai/knowledge/pkg/datastore/documentloader" "github.com/gptscript-ai/knowledge/pkg/datastore/postprocessors" "github.com/gptscript-ai/knowledge/pkg/datastore/querymodifiers" @@ -82,16 +83,36 @@ type TransformerConfig struct { GenericBaseConfig } +func FromBlueprint(name string) (*FlowConfig, error) { + bp, err := GetBlueprint(name) + if err != nil { + return nil, err + } + return FromBytes(bp) +} + +func Load(reference string) (*FlowConfig, error) { + if strings.HasPrefix(reference, "blueprint:") { + return FromBlueprint(strings.TrimPrefix(reference, "blueprint:")) + } + return FromFile(reference) +} + // FromFile reads a configuration file and returns a FlowConfig. func FromFile(filename string) (*FlowConfig, error) { content, err := os.ReadFile(filename) if err != nil { return nil, err } + return FromBytes(content) +} +func FromBytes(content []byte) (*FlowConfig, error) { // Expand environment variables in config content = []byte(os.ExpandEnv(string(content))) + var err error + var config FlowConfig jsondata := content if !json.Valid(content) { From 0c27f1324ddd7d60206bc932832470d9e84cd3a7 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Tue, 20 Aug 2024 17:38:50 +0200 Subject: [PATCH 2/2] change: no default flows-file --- pkg/cmd/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/client.go b/pkg/cmd/client.go index c2683b4..d2dd645 100644 --- a/pkg/cmd/client.go +++ b/pkg/cmd/client.go @@ -26,7 +26,7 @@ type Client struct { } type ClientFlowsConfig struct { - FlowsFile string `usage:"Path to a YAML/JSON file containing ingestion/retrieval flows" env:"KNOW_FLOWS_FILE" default:"blueprint:default"` + FlowsFile string `usage:"Path to a YAML/JSON file containing ingestion/retrieval flows" env:"KNOW_FLOWS_FILE" default:""` Flow string `usage:"Flow name" env:"KNOW_FLOW"` }