Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

documentloaders: ms office docs #1068

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 228 additions & 0 deletions documentloaders/ms_office.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
package documentloaders

import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
"path/filepath"
"strings"

"github.com/richardlehane/mscfb"
"github.com/tealeg/xlsx"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/textsplitter"
)

// Compile-time check that Office satisfies the Loader interface.
var _ Loader = Office{}

// Office loads text data from MS Office files accessed through an io.ReaderAt.
type Office struct {
// reader provides random access to the raw file contents.
reader io.ReaderAt
// size is the length handed to the section/zip readers built on top of reader.
size int64
// fileType is the lower-cased file extension set by NewOffice; Load switches on it.
fileType string
}

// NewOffice builds an MS Office loader from an io.ReaderAt, the file name and
// the file size. Only the lower-cased extension of filename is retained; Load
// later uses it to pick the parser.
func NewOffice(reader io.ReaderAt, filename string, size int64) Office {
	ext := strings.ToLower(filepath.Ext(filename))

	var office Office
	office.reader = reader
	office.size = size
	office.fileType = ext
	return office
}

// Load parses the MS Office data behind the loader's io.ReaderAt and returns
// the extracted documents. The parser is chosen from the loader's file
// extension; an extension that matches no case produces an error. The ctx
// argument is accepted for the Loader interface but is not consulted here.
// nolint
func (loader Office) Load(ctx context.Context) ([]schema.Document, error) {
	switch loader.fileType {
	case ".docx":
		return loader.loadDocx()
	case ".pptx":
		return loader.loadPPTX()
	case ".xls", ".xlsx":
		return loader.loadExcel()
	case ".doc", ".ppt":
		// Legacy .ppt files are routed through the same parser as legacy .doc files.
		// NOTE(review): loadDoc only extracts the stream named "WordDocument";
		// confirm that .ppt files actually yield content through this path.
		return loader.loadDoc()
	}

	return nil, fmt.Errorf("unsupported file type: %s", loader.fileType)
}

// LoadAndSplit loads the MS Office data via Load and passes the resulting
// documents to the given text splitter, returning the split documents. An
// error from Load is returned unchanged.
func (loader Office) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) {
	loaded, loadErr := loader.Load(ctx)
	if loadErr != nil {
		return nil, loadErr
	}
	return textsplitter.SplitDocuments(splitter, loaded)
}

func (loader Office) loadDoc() ([]schema.Document, error) {
doc, err := mscfb.New(io.NewSectionReader(loader.reader, 0, loader.size))
if err != nil {
return nil, fmt.Errorf("failed to read DOC file: %w", err)
}

var text strings.Builder
for entry, err := doc.Next(); err == nil; entry, err = doc.Next() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can doc.Next() return an error for any reason other than there being no more entries?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Umm that's a good question. I'll have to check and get back to you with that. Will submit fix next week ;)

// nolint
if entry.Name == "WordDocument" {
buf := make([]byte, entry.Size)
i, err := doc.Read(buf)
if err != nil {
return nil, fmt.Errorf("error reading WordDocument stream: %w", err)
}
if i > 0 {
// Process the binary content
for j := 0; j < i; j++ {
// Extract readable ASCII text
if buf[j] >= 32 && buf[j] <= 126 {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these numbers should be unexported constants.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it!

text.WriteByte(buf[j])
} else if buf[j] == 13 || buf[j] == 10 {
text.WriteByte('\n')
}
}
}
}
}

documents := []schema.Document{
{
PageContent: text.String(),
Metadata: map[string]interface{}{
"fileType": loader.fileType,
},
},
}

return documents, nil
}

// loadExcel copies the whole input into memory, parses it with
// xlsx.OpenBinary and returns one document per sheet. Within a sheet every
// cell is followed by a tab and every row is terminated by a newline. The
// file type, sheet name and sheet index are stored in each document's metadata.
func (loader Office) loadExcel() ([]schema.Document, error) {
	raw := bytes.NewBuffer(make([]byte, 0, loader.size))
	section := io.NewSectionReader(loader.reader, 0, loader.size)
	if _, err := io.Copy(raw, section); err != nil {
		return nil, fmt.Errorf("failed to copy Excel content: %w", err)
	}

	workbook, err := xlsx.OpenBinary(raw.Bytes())
	if err != nil {
		return nil, fmt.Errorf("failed to read Excel file: %w", err)
	}

	sheetDocs := make([]schema.Document, 0, len(workbook.Sheets))
	for idx := range workbook.Sheets {
		sheet := workbook.Sheets[idx]

		var sb strings.Builder
		for _, row := range sheet.Rows {
			for _, cell := range row.Cells {
				sb.WriteString(cell.String())
				sb.WriteString("\t")
			}
			sb.WriteString("\n")
		}

		metadata := map[string]interface{}{
			"fileType":   loader.fileType,
			"sheetName":  sheet.Name,
			"sheetIndex": idx,
		}
		sheetDocs = append(sheetDocs, schema.Document{
			PageContent: sb.String(),
			Metadata:    metadata,
		})
	}

	return sheetDocs, nil
}

// loadPPTX opens the input as a ZIP archive and concatenates the contents of
// every ppt/slides/slide*.xml entry into a single document, separating slides
// with a "--- Next Slide ---" marker.
//
// The slide XML is not parsed: a space is inserted before every '<' and after
// every '>' and the markup is otherwise kept as-is in PageContent.
//
// The archive is read directly through the loader's io.ReaderAt rather than
// being copied into memory first, and each slide entry is closed as soon as it
// has been read instead of staying open until the function returns.
//
// It returns an error if the input is not a valid ZIP archive or if a slide
// entry cannot be opened or read.
func (loader Office) loadPPTX() ([]schema.Document, error) {
	zipReader, err := zip.NewReader(loader.reader, loader.size)
	if err != nil {
		return nil, fmt.Errorf("failed to read PPTX file as ZIP: %w", err)
	}

	// readSlide returns the raw bytes of one archive entry. The defer lives in
	// this closure so that it fires per entry, not once per loadPPTX call.
	readSlide := func(file *zip.File) ([]byte, error) {
		rc, err := file.Open()
		if err != nil {
			return nil, fmt.Errorf("error opening slide XML: %w", err)
		}
		defer rc.Close()

		content, err := io.ReadAll(rc)
		if err != nil {
			return nil, fmt.Errorf("error reading content: %w", err)
		}
		return content, nil
	}

	var text strings.Builder
	for _, file := range zipReader.File {
		// PPTX stores slide content in ppt/slides/slide*.xml files
		if !strings.HasPrefix(file.Name, "ppt/slides/slide") || !strings.HasSuffix(file.Name, ".xml") {
			continue
		}

		content, err := readSlide(file)
		if err != nil {
			return nil, err
		}

		// Pad the tag delimiters so adjacent text runs do not fuse together.
		content = bytes.ReplaceAll(content, []byte("<"), []byte(" <"))
		content = bytes.ReplaceAll(content, []byte(">"), []byte("> "))
		text.Write(content)
		text.WriteString("\n--- Next Slide ---\n")
	}

	documents := []schema.Document{
		{
			PageContent: text.String(),
			Metadata: map[string]interface{}{
				"fileType": loader.fileType,
			},
		},
	}

	return documents, nil
}

// loadDocx opens the input as a ZIP archive and returns the contents of the
// word/document.xml entry as a single document.
//
// The XML is not parsed: a space is inserted before every '<' and after every
// '>' and the markup is otherwise kept as-is in PageContent. If the archive
// has no word/document.xml entry the returned document has empty content.
//
// The archive is read directly through the loader's io.ReaderAt rather than
// being copied into memory first, and the entry is closed as soon as it has
// been read instead of staying open until the function returns.
//
// It returns an error if the input is not a valid ZIP archive or if the entry
// cannot be opened or read.
func (loader Office) loadDocx() ([]schema.Document, error) {
	zipReader, err := zip.NewReader(loader.reader, loader.size)
	if err != nil {
		return nil, fmt.Errorf("failed to read DOCX file as ZIP: %w", err)
	}

	// readEntry returns the raw bytes of one archive entry. The defer lives in
	// this closure so that it fires per entry, not once per loadDocx call.
	readEntry := func(file *zip.File) ([]byte, error) {
		rc, err := file.Open()
		if err != nil {
			return nil, fmt.Errorf("error opening document.xml: %w", err)
		}
		defer rc.Close()

		content, err := io.ReadAll(rc)
		if err != nil {
			return nil, fmt.Errorf("error reading content: %w", err)
		}
		return content, nil
	}

	var text strings.Builder
	for _, file := range zipReader.File {
		if file.Name != "word/document.xml" {
			continue
		}

		content, err := readEntry(file)
		if err != nil {
			return nil, err
		}

		// Pad the tag delimiters so adjacent text runs do not fuse together.
		content = bytes.ReplaceAll(content, []byte("<"), []byte(" <"))
		content = bytes.ReplaceAll(content, []byte(">"), []byte("> "))
		text.Write(content)
	}

	documents := []schema.Document{
		{
			PageContent: text.String(),
			Metadata: map[string]interface{}{
				"fileType": loader.fileType,
			},
		},
	}

	return documents, nil
}
92 changes: 92 additions & 0 deletions documentloaders/ms_office_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package documentloaders

import (
"context"
"os"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestMSOfficeLoader loads each fixture under ./testdata through the Office
// loader and checks that exactly one document is returned and that its content
// contains the sentence stored in the fixture.
func TestMSOfficeLoader(test *testing.T) {
	test.Parallel()

	testCases := []struct {
		name     string
		path     string
		expected string
	}{
		{name: "Load .doc", path: "./testdata/test.doc", expected: "This is a .doc test file."},
		{name: "Load .docx", path: "./testdata/test.docx", expected: "This is a .docx test file."},
		{name: "Load .xlsx", path: "./testdata/test.xlsx", expected: "This is an .xlsx test file"},
		{name: "Load .pptx", path: "./testdata/test.pptx", expected: "This is a .pptx test file"},
	}

	for _, tc := range testCases {
		tc := tc // per-iteration copy; needed for parallel subtests before Go 1.22
		test.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			file, err := os.Open(tc.path)
			require.NoError(t, err)
			defer file.Close()

			fileInfo, err := file.Stat()
			require.NoError(t, err)

			loader := NewOffice(file, fileInfo.Name(), fileInfo.Size())
			docs, err := loader.Load(context.Background())
			require.NoError(t, err)

			// require rather than assert: stop here on a wrong length so that
			// indexing docs[0] below cannot panic on an empty result.
			require.Len(t, docs, 1)
			assert.True(t, strings.Contains(docs[0].PageContent, tc.expected),
				"content of %s should contain %q", tc.path, tc.expected)
		})
	}
}
Binary file added documentloaders/testdata/test.doc
Binary file not shown.
Binary file added documentloaders/testdata/test.docx
Binary file not shown.
Binary file added documentloaders/testdata/test.pptx
Binary file not shown.
Binary file added documentloaders/testdata/test.xlsx
Binary file not shown.
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ require (
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/richardlehane/msoleps v1.0.1 // indirect
github.com/rogpeppe/go-internal v1.11.0 // indirect
github.com/rs/zerolog v1.31.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
Expand All @@ -158,7 +159,6 @@ require (
gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect
Expand Down Expand Up @@ -216,10 +216,13 @@ require (
github.com/pinecone-io/go-pinecone v0.4.1
github.com/pkoukk/tiktoken-go v0.1.6
github.com/redis/rueidis v1.0.34
github.com/richardlehane/mscfb v1.0.4
github.com/tealeg/xlsx v1.0.5
github.com/weaviate/weaviate v1.24.1
github.com/weaviate/weaviate-go-client/v4 v4.13.1
gitlab.com/golang-commonmark/markdown v0.0.0-20211110145824-bf3e522c626a
go.mongodb.org/mongo-driver v1.14.0
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1
go.starlark.net v0.0.0-20230302034142-4b1e35fe2254
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
golang.org/x/tools v0.14.0
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,10 @@ github.com/qdrant/go-client v1.7.0 h1:2TeeWyZAWIup7vvD7Ne6aAvo0H+F5OUb1pB9Z8Y4pF
github.com/qdrant/go-client v1.7.0/go.mod h1:680gkxNAsVtre0Z8hAQmtPzJtz1xFAyCu2TUxULtnoE=
github.com/redis/rueidis v1.0.34 h1:cdggTaDDoqLNeoKMoew8NQY3eTc83Kt6XyfXtoCO2Wc=
github.com/redis/rueidis v1.0.34/go.mod h1:g8nPmgR4C68N3abFiOc/gUOSEKw3Tom6/teYMehg4RE=
github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM=
github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk=
github.com/richardlehane/msoleps v1.0.1 h1:RfrALnSNXzmXLbGct/P2b4xkFz4e8Gmj/0Vj9M9xC1o=
github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
Expand Down Expand Up @@ -668,6 +672,8 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tealeg/xlsx v1.0.5 h1:+f8oFmvY8Gw1iUXzPk+kz+4GpbDZPK1FhPiQRd+ypgE=
github.com/tealeg/xlsx v1.0.5/go.mod h1:btRS8dz54TDnvKNosuAqxrM1QgN1udgk9O34bDCnORM=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/testcontainers/testcontainers-go v0.31.0 h1:W0VwIhcEVhRflwL9as3dhY6jXjVCA27AkmbnZ+UTh3U=
Expand Down