Skip to content

Commit

Permalink
✨ implement billion scale data (#612)
Browse files Browse the repository at this point in the history
* implement billion scale data loader

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix dataset to datasets

Signed-off-by: Kosuke Morimoto <[email protected]>

* implement billion scale loader

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix benchmark timer

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix error handling

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix benchmark timers

Signed-off-by: Kosuke Morimoto <[email protected]>

* 🤖 Update license headers / Format Go code and YAML files

Signed-off-by: vdaas-ci <[email protected]>

* add x1b for docker build

Signed-off-by: Kosuke Morimoto <[email protected]>

* regenerate tests

Signed-off-by: Kosuke Morimoto <[email protected]>

* regenerate tests

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix generated test

Signed-off-by: Kosuke Morimoto <[email protected]>

* by reviewdog

Signed-off-by: Kosuke Morimoto <[email protected]>

* by comments

Signed-off-by: Kosuke Morimoto <[email protected]>

* stop benchmark if an error occurs

Signed-off-by: Kosuke Morimoto <[email protected]>

* fix benchmark

Signed-off-by: Kosuke Morimoto <[email protected]>

Co-authored-by: vdaas-ci <[email protected]>
  • Loading branch information
kmrmt and vdaas-ci authored Sep 24, 2020
1 parent dd60c10 commit b96c5d0
Show file tree
Hide file tree
Showing 30 changed files with 5,078 additions and 2,758 deletions.
25 changes: 25 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ CXXFLAGS ?= $(CFLAGS)
BENCH_DATASET_MD5S := $(eval BENCH_DATASET_MD5S := $(shell find $(BENCH_DATASET_MD5_DIR) -type f -regex ".*\.md5"))$(BENCH_DATASET_MD5S)
BENCH_DATASETS = $(BENCH_DATASET_MD5S:$(BENCH_DATASET_MD5_DIR)/%.md5=$(BENCH_DATASET_HDF5_DIR)/%.hdf5)

# Root directory under which the billion-scale benchmark datasets are stored.
BENCH_LARGE_DATASET_BASE_DIR = $(BENCH_DATASET_BASE_DIR)/large/dataset

# --- SIFT1B -----------------------------------------------------------------
SIFT1B_ROOT_DIR = $(BENCH_LARGE_DATASET_BASE_DIR)/sift1b

# Local paths of the SIFT1B files; the file name part is also the name of the
# remote object (plus ".gz") below SIFT1B_BASE_URL.
SIFT1B_BASE_FILE = $(SIFT1B_ROOT_DIR)/bigann_base.bvecs
SIFT1B_LEARN_FILE = $(SIFT1B_ROOT_DIR)/bigann_learn.bvecs
SIFT1B_QUERY_FILE = $(SIFT1B_ROOT_DIR)/bigann_query.bvecs
SIFT1B_GROUNDTRUTH_DIR = $(SIFT1B_ROOT_DIR)/gnd

# Must end with "/" because file names are appended to it verbatim.
SIFT1B_BASE_URL = ftp://ftp.irisa.fr/local/texmex/corpus/

# --- DEEP1B -----------------------------------------------------------------
DEEP1B_ROOT_DIR = $(BENCH_LARGE_DATASET_BASE_DIR)/deep1b

DEEP1B_BASE_FILE = $(DEEP1B_ROOT_DIR)/deep1B_base.fvecs
DEEP1B_LEARN_FILE = $(DEEP1B_ROOT_DIR)/deep1B_learn.fvecs
DEEP1B_QUERY_FILE = $(DEEP1B_ROOT_DIR)/deep1B_queries.fvecs
DEEP1B_GROUNDTRUTH_FILE = $(DEEP1B_ROOT_DIR)/deep1B_groundtruth.ivecs

# The base and learn sets are downloaded as numbered chunks (base_00..base_36
# and learn_00..learn_13) that are later concatenated into the *_FILE targets.
# The chunk names are listed explicitly instead of being generated with
# $(shell printf ... {0..N}): that form forked a shell on every expansion of
# these recursive variables and relied on bash brace expansion, which a plain
# POSIX sh does not perform.
DEEP1B_BASE_DIR = $(DEEP1B_ROOT_DIR)/base
DEEP1B_BASE_CHUNK_FILES = $(addprefix $(DEEP1B_BASE_DIR)/base_, \
    00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 \
    19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36)
DEEP1B_LEARN_DIR = $(DEEP1B_ROOT_DIR)/learn
DEEP1B_LEARN_CHUNK_FILES = $(addprefix $(DEEP1B_LEARN_DIR)/learn_, \
    00 01 02 03 04 05 06 07 08 09 10 11 12 13)

# Download-link API endpoint; the path of the wanted file inside the shared
# folder is appended after "path=". Contains "&", so always quote it in recipes.
DEEP1B_API_URL = https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=https://yadi.sk/d/11eDCm7Dsn9GA&path=

DATASET_ARGS ?= identity-128
ADDRESS_ARGS ?= ""

Expand Down
40 changes: 40 additions & 0 deletions Makefile.d/bench.mk
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,24 @@ $(BENCH_DATASET_HDF5_DIR):
$(call mkdir, $@)
$(call rm, -rf, $@/*)

# Create the directory named by the stem (the target minus the
# ".large_dataset_dir" suffix). No file called "<dir>.large_dataset_dir" is
# ever written, so the recipe runs on every invocation; it is only meant to be
# used as an order-only prerequisite. `mkdir -p` is idempotent and, unlike the
# previous `test -f` guard, fails loudly when a regular file occupies the path.
%.large_dataset_dir:
	@mkdir -p $*

# Download and decompress one SIFT1B file. The remote object is named after the
# local file with a ".gz" suffix. The stream is written to "$@.tmp" and renamed
# only on success, so an interrupted transfer never leaves a truncated file
# that a later run would accept as complete.
$(SIFT1B_BASE_FILE) $(SIFT1B_LEARN_FILE) $(SIFT1B_QUERY_FILE): | $(SIFT1B_ROOT_DIR).large_dataset_dir
	test -f $@ || { \
		curl -fsSL $(SIFT1B_BASE_URL)$(@F).gz | gunzip -d > $@.tmp \
		&& mv -f $@.tmp $@; \
	} || { rm -f $@.tmp; exit 1; }

# Download and unpack the SIFT1B ground-truth archive into SIFT1B_ROOT_DIR.
# The target is a directory, so its presence is checked with `test -d`
# (`test -f` is never true for a directory).
# NOTE(review): assumes the archive unpacks into a top-level "gnd" directory,
# matching SIFT1B_GROUNDTRUTH_DIR -- confirm against the archive contents.
$(SIFT1B_GROUNDTRUTH_DIR): | $(SIFT1B_ROOT_DIR).large_dataset_dir
	test -d $@ || curl -fsSL $(SIFT1B_BASE_URL)bigann_gnd.tar.gz | tar -C $(SIFT1B_ROOT_DIR) -zx

# Download one DEEP1B file (ground truth, queries, or a base/learn chunk).
# The API endpoint is asked for a download link for the file's path relative to
# DEEP1B_ROOT_DIR; the "href" field is cut out of the JSON reply and fetched.
#  - The link is resolved by the shell ($$(...)), not by $(shell ...), so no
#    API request is made when the file already exists.
#  - The target's own directory is created here because chunk files live in
#    sub-directories that are not prerequisites of this rule (matters for -j).
#  - Data goes to "$@.tmp" and is renamed on success to avoid partial targets.
$(DEEP1B_GROUNDTRUTH_FILE) $(DEEP1B_QUERY_FILE) $(DEEP1B_BASE_CHUNK_FILES) $(DEEP1B_LEARN_CHUNK_FILES): | $(DEEP1B_ROOT_DIR).large_dataset_dir
	mkdir -p $(@D)
	test -f $@ || { \
		href="$$(curl -fsSL "$(DEEP1B_API_URL)$(subst $(DEEP1B_ROOT_DIR),,$@)" | sed -e 's/^{\(.*\)}$$/\1/' | tr ',' '\n' | grep href | cut -d ':' -f 2- | tr -d '"')" \
		&& test -n "$$href" \
		&& curl -fsSL "$$href" -o $@.tmp \
		&& mv -f $@.tmp $@; \
	} || { rm -f $@.tmp; exit 1; }

# Assemble the DEEP1B base set by concatenating its chunks in list order.
# The chunks are order-only prerequisites: they must exist, but their
# timestamps never trigger a rebuild of the concatenated file. The result is
# written to "$@.tmp" first so a failed `cat` cannot leave a partial target.
$(DEEP1B_BASE_FILE): | $(DEEP1B_BASE_DIR).large_dataset_dir $(DEEP1B_BASE_CHUNK_FILES)
	cat $(DEEP1B_BASE_CHUNK_FILES) > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

# Assemble the DEEP1B learn set by concatenating its chunks in list order.
# The chunks are order-only prerequisites: they must exist, but their
# timestamps never trigger a rebuild of the concatenated file. The result is
# written to "$@.tmp" first so a failed `cat` cannot leave a partial target.
$(DEEP1B_LEARN_FILE): | $(DEEP1B_LEARN_DIR).large_dataset_dir $(DEEP1B_LEARN_CHUNK_FILES)
	cat $(DEEP1B_LEARN_CHUNK_FILES) > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

.PHONY: bench/datasets
## fetch datasets for benchmark
bench/datasets: $(BENCH_DATASETS)
Expand All @@ -45,6 +63,28 @@ bench/datasets/md5dir/print:
bench/datasets/hdf5dir/print:
@echo $(BENCH_DATASET_HDF5_DIR)

# Aggregate target: pulls in both billion-scale datasets.
.PHONY: bench/datasets/large
## fetch large datasets for benchmark
bench/datasets/large: bench/datasets/large/sift1b bench/datasets/large/deep1b

# Aggregate target: base, learn and query vectors plus the ground-truth directory.
.PHONY: bench/datasets/large/sift1b
## fetch sift1b dataset for benchmark
bench/datasets/large/sift1b: $(SIFT1B_BASE_FILE) $(SIFT1B_LEARN_FILE) $(SIFT1B_QUERY_FILE) $(SIFT1B_GROUNDTRUTH_DIR)

# Aggregate target: base, learn and query vectors plus the ground-truth file.
.PHONY: bench/datasets/large/deep1b
## fetch deep1b dataset for benchmark
bench/datasets/large/deep1b: $(DEEP1B_BASE_FILE) $(DEEP1B_LEARN_FILE) $(DEEP1B_QUERY_FILE) $(DEEP1B_GROUNDTRUTH_FILE)

.PHONY: bench
## run all benchmarks
bench: \
Expand Down
3 changes: 3 additions & 0 deletions dockers/tools/cli/loadtest/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ COPY pkg/${PKG} .
# Copy the command sources for ${PKG} into the matching path of the build tree.
WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/cmd/${PKG}
COPY cmd/${PKG} .

# Copy the x1b benchmark asset package (hack/benchmark/assets/x1b) into the
# build tree as well.
WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/hack/benchmark/assets/x1b
COPY hack/benchmark/assets/x1b .

# Back at the repository root: copy the version files.
WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}
COPY versions/GO_VERSION .
COPY versions/VALD_VERSION .
Expand Down
2 changes: 2 additions & 0 deletions hack/benchmark/assets/large/dataset/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore every file placed in this directory...
*
# ...except this .gitignore itself, which stays tracked.
!.gitignore
197 changes: 197 additions & 0 deletions hack/benchmark/assets/x1b/loader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
//
// Copyright (C) 2019-2020 Vdaas.org Vald team ( kpango, rinx, kmrmt )
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package x1b

import (
"os"
"path/filepath"
"syscall"
"unsafe"

"github.com/vdaas/vald/internal/errors"
)

const (
	// headerSize is the number of bytes that precede the elements of every
	// record; open reads the first record's header as an int32 dimension.
	headerSize = 4
)

var (
	// ErrOutOfBounds is returned when a requested vector index lies outside
	// the vectors held by the file.
	ErrOutOfBounds = errors.New("out of bounds")
	// ErrUnsupportedFileType is returned by Open when the file extension is
	// not one of .bvecs, .fvecs or .ivecs.
	ErrUnsupportedFileType = errors.New("unsupported file type")
)

// BillionScaleVectors is the element-type-agnostic view of a vector file
// opened by this package.
type BillionScaleVectors interface {
	// Load returns the i-th vector as an interface{}; the implementations in
	// this file return a []uint8, []float32 or []int32 depending on the file type.
	Load(i int) (interface{}, error)
	// Dimension returns the number of elements per vector.
	Dimension() int
	// Size returns the number of vectors in the file.
	Size() int
	// Close releases the resources held for the file.
	Close() error
}

// Uint8Vectors is a BillionScaleVectors whose vectors are []uint8.
type Uint8Vectors interface {
	BillionScaleVectors
	// LoadUint8 returns the i-th vector.
	LoadUint8(i int) ([]uint8, error)
}

// FloatVectors is a BillionScaleVectors whose vectors are []float32.
type FloatVectors interface {
	BillionScaleVectors
	// LoadFloat32 returns the i-th vector.
	LoadFloat32(i int) ([]float32, error)
}

// Int32Vectors is a BillionScaleVectors whose vectors are []int32.
type Int32Vectors interface {
	BillionScaleVectors
	// LoadInt32 returns the i-th vector.
	LoadInt32(i int) ([]int32, error)
}

// file is a memory-mapped vector file made of fixed-size records, each
// consisting of a headerSize-byte header followed by dim elements.
type file struct {
	// mem is the whole file as mapped read-only by syscall.Mmap.
	mem []byte
	// dim is the number of elements per vector, read from the first header.
	dim int
	// size is the number of whole records contained in mem.
	size int
	// block is the size of one record in bytes (header plus elements).
	block int
}

// bvecs exposes a file whose elements are 1-byte uint8 values.
type bvecs struct {
	*file
}

// fvecs exposes a file whose elements are 4-byte float32 values.
type fvecs struct {
	*file
}

// ivecs exposes a file whose elements are 4-byte int32 values.
type ivecs struct {
	*file
}

// open memory-maps fname read-only and interprets it as a sequence of
// fixed-size records, each made of a headerSize-byte header followed by dim
// elements of elementSize bytes.
//
// The dimension is read from the header of the first record (in host byte
// order) and is used for every record. The number of vectors is the number of
// whole records that fit in the file.
//
// An error is returned when the file cannot be opened, inspected or mapped,
// when it is too short to hold a header, or when the header holds a
// non-positive dimension (which would otherwise yield a zero or negative
// record size). The mapping is released before returning such an error.
// The file descriptor is closed before returning; the mapping stays valid
// until Close is called on the result.
func open(fname string, elementSize int) (f *file, err error) {
	fp, err := os.Open(fname)
	if err != nil {
		return nil, err
	}
	defer func() {
		if e := fp.Close(); e != nil {
			err = errors.Wrap(err, e.Error())
		}
	}()

	fi, err := fp.Stat()
	if err != nil {
		return nil, err
	}
	// The dimension is read from the first headerSize bytes, so they must exist.
	if fi.Size() < headerSize {
		return nil, errors.New("file is too small to contain a vector header")
	}

	mem, err := syscall.Mmap(int(fp.Fd()), 0, int(fi.Size()), syscall.PROT_READ, syscall.MAP_SHARED)
	if err != nil {
		return nil, err
	}

	dim := int(*(*int32)(unsafe.Pointer(&mem[0])))
	if dim <= 0 {
		// Do not leak the mapping when the header is unusable.
		_ = syscall.Munmap(mem)
		return nil, errors.New("invalid vector dimension")
	}

	block := headerSize + dim*elementSize
	return &file{
		mem:   mem,
		dim:   dim,
		size:  len(mem) / block,
		block: block,
	}, nil
}

// Close unmaps the file from memory.
//
// After a successful Close the mapping is forgotten and the vector count is
// reset to zero, so later loads report ErrOutOfBounds instead of touching
// unmapped memory. Slices obtained from earlier loads alias the mapping and
// must not be used after Close. The error from syscall.Munmap is returned
// unchanged, including for a repeated Close.
func (f *file) Close() error {
	if err := syscall.Munmap(f.mem); err != nil {
		return err
	}
	f.mem = nil
	f.size = 0
	return nil
}

// load returns the raw element bytes of the i-th record, without its header.
// The returned slice aliases the memory mapping; it is not a copy.
//
// ErrOutOfBounds is returned when i is negative or not smaller than the
// number of records (a negative index previously caused a slice-bounds panic).
func (f *file) load(i int) ([]byte, error) {
	if i < 0 || i >= f.size {
		return nil, ErrOutOfBounds
	}

	start := i*f.block + headerSize // skip this record's header
	end := (i + 1) * f.block
	return f.mem[start:end], nil
}

// Dimension returns the number of elements per vector, as read from the
// header of the first record when the file was opened.
func (f *file) Dimension() int {
	return f.dim
}

// Size returns the number of vectors (whole records) in the file.
func (f *file) Size() int {
	return f.size
}

// LoadUint8 returns the i-th vector as a []uint8 of length and capacity
// Dimension(). The slice aliases the memory mapping and is only valid until
// Close. It returns the error from load (ErrOutOfBounds) for an invalid index.
func (bv *bvecs) LoadUint8(i int) ([]uint8, error) {
	buf, err := bv.load(i)
	if err != nil {
		return nil, err
	}
	// []byte and []uint8 are the same type, so the record can be re-sliced
	// directly; no unsafe array cast (with its fixed size limit) is needed.
	return buf[:bv.dim:bv.dim], nil
}

// Load returns the i-th vector, boxed as an interface{} holding a []uint8.
func (bv *bvecs) Load(i int) (interface{}, error) {
	vec, err := bv.LoadUint8(i)
	return vec, err
}

// LoadFloat32 returns the i-th vector as a []float32 of length and capacity
// Dimension(). The record bytes are reinterpreted in place, so the slice
// aliases the memory mapping rather than copying it.
func (fv *fvecs) LoadFloat32(i int) ([]float32, error) {
	raw, err := fv.load(i)
	if err != nil {
		return nil, err
	}
	// View the record through a large fixed-size array type, then cut it down
	// to the vector dimension.
	view := (*[1 << 26]float32)(unsafe.Pointer(&raw[0]))
	n := fv.dim
	return view[:n:n], nil
}

// Load returns the i-th vector, boxed as an interface{} holding a []float32.
func (fv *fvecs) Load(i int) (interface{}, error) {
	vec, err := fv.LoadFloat32(i)
	return vec, err
}

// LoadInt32 returns the i-th vector as a []int32 of length and capacity
// Dimension(). The record bytes are reinterpreted in place, so the slice
// aliases the memory mapping rather than copying it.
func (iv *ivecs) LoadInt32(i int) ([]int32, error) {
	raw, err := iv.load(i)
	if err != nil {
		return nil, err
	}
	// View the record through a large fixed-size array type, then cut it down
	// to the vector dimension.
	view := (*[1 << 26]int32)(unsafe.Pointer(&raw[0]))
	n := iv.dim
	return view[:n:n], nil
}

// Load returns the i-th vector, boxed as an interface{} holding a []int32.
func (iv *ivecs) Load(i int) (interface{}, error) {
	vec, err := iv.LoadInt32(i)
	return vec, err
}

// NewUint8Vectors opens fname as a vector file with 1-byte elements and
// returns it as Uint8Vectors, or the error reported while opening it.
func NewUint8Vectors(fname string) (Uint8Vectors, error) {
	const elementSize = 1 // bytes per uint8 element
	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &bvecs{file: mapped}, nil
}

// NewFloatVectors opens fname as a vector file with 4-byte elements and
// returns it as FloatVectors, or the error reported while opening it.
func NewFloatVectors(fname string) (FloatVectors, error) {
	const elementSize = 4 // bytes per float32 element
	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &fvecs{file: mapped}, nil
}

// NewInt32Vectors opens fname as a vector file with 4-byte elements and
// returns it as Int32Vectors, or the error reported while opening it.
func NewInt32Vectors(fname string) (Int32Vectors, error) {
	const elementSize = 4 // bytes per int32 element
	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &ivecs{file: mapped}, nil
}

// Open opens fname with the loader selected by its file extension:
// ".bvecs" -> NewUint8Vectors, ".fvecs" -> NewFloatVectors,
// ".ivecs" -> NewInt32Vectors. Any other extension yields
// ErrUnsupportedFileType. The extension match is case-sensitive.
func Open(fname string) (BillionScaleVectors, error) {
	ext := filepath.Ext(fname)
	if ext == ".bvecs" {
		return NewUint8Vectors(fname)
	}
	if ext == ".fvecs" {
		return NewFloatVectors(fname)
	}
	if ext == ".ivecs" {
		return NewInt32Vectors(fname)
	}
	return nil, ErrUnsupportedFileType
}
Loading

0 comments on commit b96c5d0

Please sign in to comment.