Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
fix: do not error on unsupported file types by default upon ingestion…
Browse files Browse the repository at this point in the history
… - but add --err-on-unsupported-file flag to toggle that behavior
  • Loading branch information
iwilltry42 committed Aug 21, 2024
1 parent 0903a12 commit 4e55670
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 34 deletions.
22 changes: 12 additions & 10 deletions pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package client

import (
"context"

"github.com/gptscript-ai/knowledge/pkg/datastore"
"github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
Expand All @@ -11,16 +12,17 @@ import (
)

// IngestPathsOpts bundles the options used when ingesting paths into a dataset.
// The ingest and askdir commands populate it from their CLI flags.
//
// NOTE(review): this listing is a diff rendering, so the field list appears
// twice (before and after the change). Only the second list, which adds
// ErrOnUnsupportedFile, reflects the current definition.
type IngestPathsOpts struct {
IgnoreExtensions []string
Concurrency int
Recursive bool
TextSplitterOpts *textsplitter.TextSplitterOpts
IngestionFlows []flows.IngestionFlow
IgnoreFile string
IncludeHidden bool
NoCreateDataset bool
IsDuplicateFuncName string
Prune bool // Prune deleted files
IgnoreExtensions []string
Concurrency int
Recursive bool
TextSplitterOpts *textsplitter.TextSplitterOpts
IngestionFlows []flows.IngestionFlow
IgnoreFile string
IncludeHidden bool
NoCreateDataset bool
IsDuplicateFuncName string
Prune bool // Prune deleted files
ErrOnUnsupportedFile bool // When false, an unsupported-file-type error from ingesting a file is dropped instead of returned
}

type Client interface {
Expand Down
9 changes: 5 additions & 4 deletions pkg/client/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@ import (
"crypto/sha1"
"encoding/hex"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"

"github.com/go-git/go-git/v5/plumbing/format/gitignore"
"github.com/gptscript-ai/knowledge/pkg/datastore"
remotes "github.com/gptscript-ai/knowledge/pkg/datastore/documentloader/remote"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
"github.com/gptscript-ai/knowledge/pkg/index"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
"log/slog"
"os"
"path/filepath"
"strings"
)

func isIgnored(ignore gitignore.Matcher, path string) bool {
Expand Down
10 changes: 9 additions & 1 deletion pkg/client/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@ package client

import (
"context"
"errors"
"fmt"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
"os"
"path/filepath"

"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"

"github.com/acorn-io/z"
"github.com/gptscript-ai/knowledge/pkg/datastore"
"github.com/gptscript-ai/knowledge/pkg/index"
Expand Down Expand Up @@ -100,6 +103,11 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op
}

_, err = c.Ingest(ctx, datasetID, file, iopts)

if err != nil && !opts.ErrOnUnsupportedFile && errors.Is(err, &documentloader.UnsupportedFileTypeError{}) {
err = nil
}

return err
}

Expand Down
15 changes: 8 additions & 7 deletions pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
query := args[0]

ingestOpts := &client.IngestPathsOpts{
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: !s.NoRecursive,
IgnoreFile: s.IgnoreFile,
IncludeHidden: s.IncludeHidden,
IsDuplicateFuncName: s.DeduplicationFuncName,
Prune: !s.NoPrune,
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: !s.NoRecursive,
IgnoreFile: s.IgnoreFile,
IncludeHidden: s.IncludeHidden,
IsDuplicateFuncName: s.DeduplicationFuncName,
Prune: !s.NoPrune,
ErrOnUnsupportedFile: s.ErrOnUnsupportedFile,
}

retrieveOpts := &datastore.RetrieveOpts{
Expand Down
28 changes: 20 additions & 8 deletions pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cmd
import (
"fmt"
"log/slog"
"os"
"strings"

"github.com/acorn-io/z"
Expand Down Expand Up @@ -30,6 +31,7 @@ type ClientIngestOpts struct {
NoRecursive bool `usage:"Don't recursively ingest directories" default:"false" env:"KNOW_NO_INGEST_RECURSIVE"`
NoCreateDataset bool `usage:"Do NOT create the dataset if it doesn't exist" default:"true" env:"KNOW_INGEST_NO_CREATE_DATASET"`
DeduplicationFuncName string `usage:"Name of the deduplication function to use" name:"dedupe-func" env:"KNOW_INGEST_DEDUPE_FUNC"`
ErrOnUnsupportedFile bool `usage:"Error on unsupported file types" default:"false" env:"KNOW_INGEST_ERR_ON_UNSUPPORTED_FILE"`
}

func (s *ClientIngest) Customize(cmd *cobra.Command) {
Expand All @@ -56,15 +58,25 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
datasetID := s.Dataset
filePath := args[0]

finfo, err := os.Stat(filePath)
if err != nil {
return err
}
if !finfo.IsDir() {
slog.Debug("ingesting single file, setting err-on-unsupported-file to true", "file", filePath)
s.ErrOnUnsupportedFile = true
}

ingestOpts := &client.IngestPathsOpts{
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: !s.NoRecursive,
TextSplitterOpts: &s.TextSplitterOpts,
IgnoreFile: s.IgnoreFile,
IncludeHidden: s.IncludeHidden,
IsDuplicateFuncName: s.DeduplicationFuncName,
Prune: s.Prune,
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: !s.NoRecursive,
TextSplitterOpts: &s.TextSplitterOpts,
IgnoreFile: s.IgnoreFile,
IncludeHidden: s.IncludeHidden,
IsDuplicateFuncName: s.DeduplicationFuncName,
Prune: s.Prune,
ErrOnUnsupportedFile: s.ErrOnUnsupportedFile,
}

if s.FlowsFile != "" {
Expand Down
19 changes: 17 additions & 2 deletions pkg/datastore/documentloader/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@ import (
lcgodocloaders "github.com/tmc/langchaingo/documentloaders"
)

// UnsupportedFileTypeError is returned when a file type is not supported.
// FileType holds the file type string for which no loader was available.
type UnsupportedFileTypeError struct {
	FileType string
}

// Error implements the error interface; the file type is rendered quoted.
func (e *UnsupportedFileTypeError) Error() string {
	return fmt.Sprintf("unsupported file type %q", e.FileType)
}

// Is reports whether err is an *UnsupportedFileTypeError, ignoring its
// FileType. This lets errors.Is(err, &UnsupportedFileTypeError{}) match any
// unsupported-file-type error found in err's chain.
//
// The comparison is intentionally shallow: errors.Is already walks the chain
// of the error being inspected, and an Is method must not unwrap the target.
func (e *UnsupportedFileTypeError) Is(err error) bool {
	_, ok := err.(*UnsupportedFileTypeError)
	return ok
}

func DefaultDocLoaderFunc(filetype string) func(ctx context.Context, reader io.Reader) ([]vs.Document, error) {
switch filetype {
case ".pdf", "application/pdf":
Expand Down Expand Up @@ -131,7 +146,7 @@ func DefaultDocLoaderFunc(filetype string) func(ctx context.Context, reader io.R
dlf := DefaultDocLoaderFunc(ft)
if dlf == nil {
slog.Error("Unsupported file type in ZIP", "type", ft, "filename", f.Name)
return nil, fmt.Errorf("unsupported file type %q (file %q) in ZIP", f.Name, ft)
return nil, fmt.Errorf("%w (file %q) in ZIP", &UnsupportedFileTypeError{ft}, f.Name)
}
docs, err := dlf(ctx, bytes.NewReader(content))
if err != nil {
Expand Down Expand Up @@ -184,7 +199,7 @@ func DefaultDocLoaderFunc(filetype string) func(ctx context.Context, reader io.R
}

default:
slog.Error("Unsupported file type", "type", filetype)
slog.Debug("Unsupported file type", "type", filetype)
return nil
}
}
6 changes: 4 additions & 2 deletions pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ import (
"bytes"
"context"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings"
"log/slog"

"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings"

"github.com/acorn-io/z"
"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
Expand Down Expand Up @@ -142,7 +144,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
}

if ingestionFlow.Load == nil {
return nil, fmt.Errorf("unsupported filetype %q (file %q)", filetype, opts.FileMetadata.AbsolutePath)
return nil, fmt.Errorf("%w (file %q)", &documentloader.UnsupportedFileTypeError{FileType: filetype}, opts.FileMetadata.AbsolutePath)
}

// Mandatory Transformation: Add filename to metadata
Expand Down

0 comments on commit 4e55670

Please sign in to comment.