Skip to content

Commit

Permalink
WIP: merge fallback encoding feature from hound-search#388
Browse files Browse the repository at this point in the history
  • Loading branch information
Mikhail Pruzhanskiy committed Nov 21, 2023
1 parent 4cbc3cf commit 93c90e3
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 37 deletions.
27 changes: 16 additions & 11 deletions codesearch/index/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package index

import (
"errors"
"fmt"
"io"
"io/ioutil"
Expand All @@ -14,6 +15,7 @@ import (
"unsafe"

"github.com/hound-search/hound/codesearch/sparse"
"golang.org/x/text/encoding"
)

// Index writing. See read.go for details of on-disk format.
Expand Down Expand Up @@ -123,17 +125,17 @@ func (ix *IndexWriter) AddFile(name string) {
func (ix *IndexWriter) Add(name string, f io.Reader) string {
ix.trigram.Reset()
var (
c = byte(0) //nolint
c = byte(0) //nolint
i = 0
buf = ix.inbuf[:0]
tv = uint32(0)
n = int64(0)
linelen = 0
numLines = 0
longLines = 0
skipReason = "" //nolint
skipReason = "" //nolint
)

const invalidUTF8 = "Invalid UTF-8"
for {
tv = (tv << 8) & (1<<24 - 1)
if i >= len(buf) {
Expand All @@ -144,6 +146,9 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string {
break
}
log.Printf("%s: %v\n", name, err)
if errors.Is(err, encoding.ErrInvalidUTF8) {
return invalidUTF8
}
return ""
}
log.Printf("%s: 0-length read\n", name)
Expand All @@ -159,7 +164,7 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string {
ix.trigram.Add(tv)
}
if !validUTF8((tv>>8)&0xFF, tv&0xFF) {
skipReason = "Invalid UTF-8"
skipReason = invalidUTF8
if ix.LogSkip {
log.Printf("%s: %s\n", name, skipReason)
}
Expand Down Expand Up @@ -246,7 +251,7 @@ func (ix *IndexWriter) Flush() {

os.Remove(ix.nameData.name)
for _, d := range ix.postData {
unmmap(d) //nolint
unmmap(d) //nolint
}
for _, f := range ix.postFile {
f.Close()
Expand Down Expand Up @@ -310,7 +315,7 @@ func (ix *IndexWriter) flushPost() {
}

ix.post = ix.post[:0]
w.Seek(0, 0) //nolint
w.Seek(0, 0) //nolint
ix.postFile = append(ix.postFile, w)
}

Expand Down Expand Up @@ -368,7 +373,7 @@ type postChunk struct {
m []postEntry // remaining entries after e
}

const postBuf = 4096 //nolint
const postBuf = 4096 //nolint

// A postHeap is a heap (priority queue) of postChunks.
type postHeap struct {
Expand All @@ -388,7 +393,7 @@ func (h *postHeap) addMem(x []postEntry) {

// step reads the next entry from ch and saves it in ch.e.
// It returns false if ch is over.
func (h *postHeap) step(ch *postChunk) bool { //nolint
func (h *postHeap) step(ch *postChunk) bool { //nolint
old := ch.e
m := ch.m
if len(m) == 0 {
Expand All @@ -414,7 +419,7 @@ func (h *postHeap) add(ch *postChunk) {
}

// empty reports whether the postHeap is empty.
func (h *postHeap) empty() bool { //nolint
func (h *postHeap) empty() bool { //nolint
return len(h.ch) == 0
}

Expand Down Expand Up @@ -492,7 +497,7 @@ type bufWriter struct {
name string
file *os.File
buf []byte
tmp [8]byte //nolint
tmp [8]byte //nolint
}

// bufCreate creates a new file with the given name and returns a
Expand Down Expand Up @@ -578,7 +583,7 @@ func (b *bufWriter) flush() {
func (b *bufWriter) finish() *os.File {
b.flush()
f := b.file
f.Seek(0, 0) //nolint
f.Seek(0, 0) //nolint
return f
}

Expand Down
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Repo struct {
EnablePollUpdates *bool `json:"enable-poll-updates"`
EnablePushUpdates *bool `json:"enable-push-updates"`
AutoGeneratedFiles []string `json:"auto-generated-files"`
FallbackEncoding string `json:"fallback-encoding"`
}

// Used for interpreting the config value for fields that use *bool. If a value
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ go 1.16

require (
github.com/blang/semver/v4 v4.0.0
golang.org/x/mod v0.10.0
golang.org/x/mod v0.14.0
golang.org/x/text v0.14.0
)
25 changes: 23 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,46 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk=
golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0=
golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
64 changes: 49 additions & 15 deletions index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (

"github.com/hound-search/hound/codesearch/index"
"github.com/hound-search/hound/codesearch/regexp"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)

const (
Expand All @@ -37,6 +39,7 @@ type IndexOptions struct {
ExcludeDotFiles bool
SpecialFiles []string
AutoGeneratedFiles []string
FallbackEnc encoding.Encoding
}

type SearchOptions struct {
Expand Down Expand Up @@ -256,7 +259,7 @@ func (n *Index) Search(pat string, opt *SearchOptions) (*SearchResponse, error)
}, nil
}

func isTextFile(filename string) (bool, error) {
func isTextFile(filename string) (isText bool, err error) {
buf := make([]byte, filePeekSize)
r, err := os.Open(filename)
if err != nil {
Expand All @@ -271,14 +274,14 @@ func isTextFile(filename string) (bool, error) {

buf = buf[:n]

if n < filePeekSize {
// read the whole file, must be valid.
return utf8.Valid(buf), nil
if n < filePeekSize && utf8.Valid(buf) || // read the whole file, must be valid.
n >= filePeekSize && validUTF8IgnoringPartialTrailingRune(buf) { // read a prefix, allow trailing partial runes.
return true, nil
}

// read a prefix, allow trailing partial runes.
return validUTF8IgnoringPartialTrailingRune(buf), nil

if isBinary(buf) {
return false, nil
}
return true, nil
}

// Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed
Expand Down Expand Up @@ -307,28 +310,59 @@ func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
return true
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) {
func isBinary(p []byte) bool {
for _, c := range p {
if c < 10 {
return true
}
}
return false
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string, fallbackEnc encoding.Encoding) (string, error) {
rel, err := filepath.Rel(src, path)
if err != nil {
return "", err
}

r, err := os.Open(path)
fh, err := os.Open(path)
if err != nil {
return "", err
}
defer r.Close()
defer fh.Close()

dup := filepath.Join(dst, "raw", rel)
w, err := os.Create(dup)
if err != nil {
return "", err
}
defer w.Close()

g := gzip.NewWriter(w)
defer g.Close()
r := io.Reader(fh)

// Without fallback encoding, assume UTF-8.
maybeValidated := r
if fallbackEnc != nil {
maybeValidated = transform.NewReader(r, encoding.UTF8Validator)
}
skipReason := ix.Add(rel, io.TeeReader(maybeValidated, g))
if fallbackEnc == nil || skipReason == "" || skipReason != "Invalid UTF-8" {
return skipReason, nil
}

// Reset, then try the fallback encoding.
if _, err = fh.Seek(0, 0); err != nil {
return skipReason, err
}
if _, err = w.Seek(0, 0); err != nil {
return skipReason, err
}
if err = w.Truncate(0); err != nil {
return skipReason, err
}
g.Reset(w)
r = fallbackEnc.NewDecoder().Reader(r)
return ix.Add(rel, io.TeeReader(r, g)), nil
}

Expand Down Expand Up @@ -426,20 +460,20 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error {
return nil
}

txt, err := isTextFile(path)
isText, err := isTextFile(path)
if err != nil {
return err
}

if !txt {
if !isText {
excluded = append(excluded, &ExcludedFile{
rel,
reasonNotText,
})
return nil
}

reasonForExclusion, err := addFileToIndex(ix, dst, src, path)
reasonForExclusion, err := addFileToIndex(ix, dst, src, path, opt.FallbackEnc)
if err != nil {
return err
}
Expand Down
37 changes: 35 additions & 2 deletions index/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import (
"path/filepath"
"runtime"
"testing"

"github.com/hound-search/hound/codesearch/index"
"golang.org/x/text/encoding/charmap"
)

const (
Expand Down Expand Up @@ -37,7 +40,7 @@ func TestSearch(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer ref.Remove() //nolint
defer ref.Remove() //nolint

// Make sure the metadata in the ref is good.
if ref.Rev != rev {
Expand Down Expand Up @@ -116,7 +119,7 @@ func TestRead(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer ref.Remove() //nolint
defer ref.Remove() //nolint

r, err := Read(ref.Dir())
if err != nil {
Expand All @@ -137,3 +140,33 @@ func TestRead(t *testing.T) {
}
defer idx.Close()
}

// TestFallbackEnc checks that addFileToIndex skips an ISO 8859-2 encoded file
// when no fallback encoding is supplied, and indexes the same file without a
// skip reason once charmap.ISO8859_2 is passed as the fallback encoding.
func TestFallbackEnc(t *testing.T) {
	dst, err := ioutil.TempDir(os.TempDir(), "hound")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dst)
	// addFileToIndex creates its gzipped copy under dst/raw, so the directory
	// must exist; fail here rather than with a confusing create error later.
	if err := os.MkdirAll(filepath.Join(dst, "raw"), 0701); err != nil {
		t.Fatal(err)
	}

	ix := index.Create(filepath.Join(dst, "tri"))
	defer ix.Close()

	// The fixture was generated with:
	// { for i in $(seq 0 $(( 2048 / 43 ))); do echo '2048 byte of ASCII to fill the peek buffer'; done; echo ''; echo 'árvíztűrő tükörfúrógép' |iconv -f UTF8 -t ISO8859-2; } > testdata/iso8859_2.txt
	const src = "testdata"
	const path = "iso8859_2.txt"

	// Without a fallback encoding the non-UTF-8 tail must cause a skip.
	skipReason, err := addFileToIndex(ix, dst, src, filepath.Join(src, path), nil)
	if err != nil {
		t.Fatal(err)
	}
	if skipReason == "" {
		t.Error("wanted skip, got success without fallback encoding")
	}

	// With ISO 8859-2 as fallback the file must be indexed.
	skipReason, err = addFileToIndex(ix, dst, src, filepath.Join(src, path), charmap.ISO8859_2)
	if err != nil {
		t.Fatal(err)
	}
	if skipReason != "" {
		t.Errorf("wanted success, got skip %q", skipReason)
	}
}
Loading

0 comments on commit 93c90e3

Please sign in to comment.