Get all chunk references for a given file #1185
Changes from all commits: 9104193, eb2d20f, 3dc9065, 7b56fd2, ce973dc
@@ -19,6 +19,7 @@ package storage

import (
	"context"
	"io"
	"sort"
)

/*
@@ -96,3 +97,42 @@ func (f *FileStore) Store(ctx context.Context, data io.Reader, size int64, toEnc

func (f *FileStore) HashSize() int {
	return f.hashFunc().Size()
}
// GetAllReferences is a public API. It returns all chunk hashes (only) for a given file
func (f *FileStore) GetAllReferences(ctx context.Context, data io.Reader, toEncrypt bool) (addrs AddressCollection, err error) {
	// create a special kind of putter, which only will store the references
	putter := &HashExplorer{
		hasherStore: NewHasherStore(f.ChunkStore, f.hashFunc, toEncrypt),
		References:  make([]Reference, 0),
	}
	// do the actual splitting anyway, no way around it
	_, _, err = PyramidSplit(ctx, data, putter, putter)
	if err != nil {
		return nil, err
	}
	// collect all references
	addrs = NewAddressCollection(0)
	for _, ref := range putter.References {
		addrs = append(addrs, Address(ref))
	}
	sort.Sort(addrs)
	return addrs, nil
}
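For context, a hypothetical caller of this new API could look like the sketch below. The helper name printChunkAddresses, the os/fmt usage and the error handling are illustrative assumptions, not part of this diff:

	// Sketch: given an existing *FileStore, list every chunk address a
	// file maps to, without uploading it. Assumes imports of "context",
	// "fmt" and "os" alongside this package's types.
	func printChunkAddresses(ctx context.Context, fileStore *FileStore, path string) error {
		f, err := os.Open(path)
		if err != nil {
			return err
		}
		defer f.Close()
		addrs, err := fileStore.GetAllReferences(ctx, f, false) // false = no encryption
		if err != nil {
			return err
		}
		for _, addr := range addrs {
			fmt.Printf("%x\n", addr) // addresses come back sorted (see sort.Sort above)
		}
		return nil
	}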
// HashExplorer is a special kind of putter which will only store chunk references
type HashExplorer struct {
	*hasherStore
	References []Reference
}
// HashExplorer's Put will add just the chunk hashes to its `References`
func (he *HashExplorer) Put(ctx context.Context, chunkData ChunkData) (Reference, error) {
	// Need to do the actual Put, which returns the references
	ref, err := he.hasherStore.Put(ctx, chunkData)
	if err != nil {
		return nil, err
	}
	// internally store the reference
	he.References = append(he.References, ref)
	return ref, nil
}

Review discussion on the Put override:

- nolash: Oh my, it's pretty clumsy having to hash everything twice... I wonder why the pyramidsplitter only returns the data, not the reference.
- Reply: @nolash not everything is hashed twice - and the points in time for the requests are separated. We use […]. We use […]. Is the use case clear now?
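To make the reply's point about separated points in time concrete, here is a hedged sketch contrasting the two calls. Store's parameter list is visible in the hunk header above, but its (addr, wait, err) return shape is an assumption here, as is the helper name:

	// Sketch: upload and audit are two distinct moments; neither call
	// needs the other's result.
	func uploadThenAudit(ctx context.Context, fileStore *FileStore, content []byte) (AddressCollection, error) {
		// 1) Upload time: Store splits, hashes and persists the chunks.
		_, wait, err := fileStore.Store(ctx, bytes.NewReader(content), int64(len(content)), false)
		if err != nil {
			return nil, err
		}
		if err := wait(ctx); err != nil { // block until chunks are actually stored
			return nil, err
		}
		// 2) Later, audit time: re-split the same content purely to
		// enumerate every chunk hash it maps to.
		return fileStore.GetAllReferences(ctx, bytes.NewReader(content), false)
	}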
@@ -173,3 +173,39 @@ func testFileStoreCapacity(toEncrypt bool, t *testing.T) {
		t.Fatalf("Comparison error after clearing memStore.")
	}
}
// TestGetAllReferences only tests that GetAllReferences returns an expected
// number of references for a given file
func TestGetAllReferences(t *testing.T) {
	tdb, cleanup, err := newTestDbStore(false, false)
	defer cleanup()
	if err != nil {
		t.Fatalf("init dbStore failed: %v", err)
	}
	db := tdb.LDBStore
	memStore := NewMemStore(NewDefaultStoreParams(), db)
	localStore := &LocalStore{
		memStore: memStore,
		DbStore:  db,
	}
	fileStore := NewFileStore(localStore, NewFileStoreParams())

	checkRefs := func(dataSize int, expectedLen int) {
		slice := testutil.RandomBytes(1, dataSize)

		addrs, err := fileStore.GetAllReferences(context.Background(), bytes.NewReader(slice), false)
		if err != nil {
			t.Fatal(err)
		}
		if len(addrs) != expectedLen {
			t.Fatalf("Expected reference array length to be %d, but is %d", expectedLen, len(addrs))
		}
	}

	// testRuns[i] and expectedLens[i] are dataSize and expected length respectively
	testRuns := []int{1024, 8192, 16000, 30000, 1000000}
	expectedLens := []int{1, 3, 5, 9, 248}
	for i, r := range testRuns {
		checkRefs(r, expectedLens[i])
	}
}

Review discussion on the expected lengths:

- Comment: I am only aware of one bug in the pyramidchunker, which occurs - as far as I remember, although this was a while ago - when you have all batches in a tree filled plus one chunk. Neither 247 nor 248 matches the count of such a configuration. Meanwhile, for 1000000 bytes 248 should be correct: […]
- Comment: If you can reproduce this anomaly using the same data (randomized but from a fixed seed), then perhaps we could discover which chunk goes missing, and if it's the same one.
- Reply: If this is indeed flaky, alternating between 247 and 248, we must find what is going on...
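The calculation elided from the first comment above is easy to reconstruct. The sketch below assumes the standard 4096-byte chunk size and a branching factor of 128 references per tree chunk; neither constant appears in this diff, and the helper name is hypothetical. For 1000000 bytes it gives 245 data chunks, ceil(245/128) = 2 intermediate chunks and 1 root, i.e. 248, and it reproduces every entry of expectedLens:

	// expectedChunkCount returns the total number of chunks (data +
	// intermediate + root) a file of n bytes splits into, assuming
	// 4096-byte chunks and 128 references per tree chunk.
	func expectedChunkCount(n int) int {
		const chunkSize, branches = 4096, 128
		level := (n + chunkSize - 1) / chunkSize // data chunks at the bottom level
		if level <= 1 {
			return 1 // a single chunk is its own root (1024 bytes -> 1)
		}
		total := level
		for level > 1 {
			level = (level + branches - 1) / branches // parents needed at the next level up
			total += level
		}
		return total
	}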
Review discussion on ordering:

- Comment: Is it intentional that the references are returned in arbitrary order?
- Comment: @nolash is right - we should sort them before returning them.
- Reply: And what sort order should this be? By value? By hierarchy? If the latter, how to achieve that when the putter will get the hashes in no specific order?
- Comment: Alphabetical - so that when we have a tool to query N nodes whether they have a list of chunks, they are all sorted in the same order and we can quickly merge and check N such lists.
- Reply: OK, I will add alphabetical ascending sort order.
- Comment: Yes, alphabetical order would be the best.
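For sort.Sort(addrs) in the diff above to produce the agreed ascending ("alphabetical") order, AddressCollection has to satisfy sort.Interface with a byte-wise comparison. A minimal sketch of that contract, assuming Address is a byte slice (the actual definitions live elsewhere in the storage package):

	// Sketch of the sort.Interface implementation sort.Sort(addrs) relies on.
	type AddressCollection []Address

	func (c AddressCollection) Len() int           { return len(c) }
	func (c AddressCollection) Less(i, j int) bool { return bytes.Compare(c[i], c[j]) == -1 } // byte-wise ascending
	func (c AddressCollection) Swap(i, j int)      { c[i], c[j] = c[j], c[i] }

Byte-wise comparison gives every node the same deterministic order, which is exactly what the merge-and-check tooling described above needs.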