Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement billion scale data #612

Merged
merged 15 commits into from
Sep 24, 2020
25 changes: 25 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ CXXFLAGS ?= $(CFLAGS)
# Collect every *.md5 file found under BENCH_DATASET_MD5_DIR. The $(eval ...)
# assigns the `find` result to the variable first; the trailing reference then
# expands to that freshly assigned value.
BENCH_DATASET_MD5S := $(eval BENCH_DATASET_MD5S := $(shell find $(BENCH_DATASET_MD5_DIR) -type f -regex ".*\.md5"))$(BENCH_DATASET_MD5S)
# Map each <md5 dir>/NAME.md5 entry to <hdf5 dir>/NAME.hdf5.
BENCH_DATASETS = $(BENCH_DATASET_MD5S:$(BENCH_DATASET_MD5_DIR)/%.md5=$(BENCH_DATASET_HDF5_DIR)/%.hdf5)

# Root directory that holds the large benchmark datasets.
BENCH_LARGE_DATASET_BASE_DIR = $(BENCH_DATASET_BASE_DIR)/large/dataset

# Directory for all SIFT1B files.
SIFT1B_ROOT_DIR = $(BENCH_LARGE_DATASET_BASE_DIR)/sift1b

# Local paths of the SIFT1B base/learn/query vector files and of the
# ground-truth directory.
SIFT1B_BASE_FILE = $(SIFT1B_ROOT_DIR)/bigann_base.bvecs
SIFT1B_LEARN_FILE = $(SIFT1B_ROOT_DIR)/bigann_learn.bvecs
SIFT1B_QUERY_FILE = $(SIFT1B_ROOT_DIR)/bigann_query.bvecs
SIFT1B_GROUNDTRUTH_DIR = $(SIFT1B_ROOT_DIR)/gnd

# Remote prefix the SIFT1B archives are downloaded from (file name is appended).
SIFT1B_BASE_URL = ftp://ftp.irisa.fr/local/texmex/corpus/

# Directory for all DEEP1B files.
DEEP1B_ROOT_DIR = $(BENCH_LARGE_DATASET_BASE_DIR)/deep1b

# Local paths of the DEEP1B base/learn/query vector files and of the
# ground-truth file.
DEEP1B_BASE_FILE = $(DEEP1B_ROOT_DIR)/deep1B_base.fvecs
DEEP1B_LEARN_FILE = $(DEEP1B_ROOT_DIR)/deep1B_learn.fvecs
DEEP1B_QUERY_FILE = $(DEEP1B_ROOT_DIR)/deep1B_queries.fvecs
DEEP1B_GROUNDTRUTH_FILE = $(DEEP1B_ROOT_DIR)/deep1B_groundtruth.ivecs

# Zero-padded chunk indices (00..36 for base, 00..13 for learn). They are
# generated once at parse time with `seq -w`: the previous `{0..N}` brace
# expansion is a bash feature that a POSIX /bin/sh (e.g. dash) does not expand,
# and a recursive `=` around $(shell ...) re-forked on every expansion.
DEEP1B_BASE_CHUNK_IDS := $(shell seq -w 0 36)
DEEP1B_LEARN_CHUNK_IDS := $(shell seq -w 0 13)

# Directories and per-chunk file lists for the DEEP1B base and learn sets.
# Kept recursive (`=`) so they still follow later changes of DEEP1B_ROOT_DIR.
DEEP1B_BASE_DIR = $(DEEP1B_ROOT_DIR)/base
DEEP1B_BASE_CHUNK_FILES = $(addprefix $(DEEP1B_BASE_DIR)/base_,$(DEEP1B_BASE_CHUNK_IDS))
DEEP1B_LEARN_DIR = $(DEEP1B_ROOT_DIR)/learn
DEEP1B_LEARN_CHUNK_FILES = $(addprefix $(DEEP1B_LEARN_DIR)/learn_,$(DEEP1B_LEARN_CHUNK_IDS))

# Yandex Disk public-resource download endpoint for DEEP1B. The resource path
# is appended directly after the trailing "path=".
DEEP1B_API_URL = https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=https://yadi.sk/d/11eDCm7Dsn9GA&path=

# Defaults that apply only when the variable is not already set (`?=`).
DATASET_ARGS ?= identity-128
ADDRESS_ARGS ?= ""

Expand Down
40 changes: 40 additions & 0 deletions Makefile.d/bench.mk
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,24 @@ $(BENCH_DATASET_HDF5_DIR):
$(call mkdir, $@)
$(call rm, -rf, $@/*)

# Pseudo-target "<dir>.large_dataset_dir": ensures directory <dir> (the stem)
# exists. Used as an order-only prerequisite by the dataset download rules.
# The check is `test -d`; the earlier `test -f` never matched a directory and
# silently skipped creation when a regular file occupied the path.
%.large_dataset_dir:
	@test -d $* || mkdir -p $*

# Download and decompress one SIFT1B vector file (<name>.gz from SIFT1B_BASE_URL).
# The data is written to $@.tmp and renamed only on success, so a failed or
# interrupted transfer never leaves a truncated file that a later run would
# accept as already downloaded.
$(SIFT1B_BASE_FILE) $(SIFT1B_LEARN_FILE) $(SIFT1B_QUERY_FILE): | $(SIFT1B_ROOT_DIR).large_dataset_dir
	test -f $@ || { \
		curl -fsSL $(SIFT1B_BASE_URL)$(@F).gz | gunzip -d > $@.tmp \
		&& mv -f $@.tmp $@; \
	} || { rm -f $@.tmp; exit 1; }

# Download the SIFT1B ground-truth archive and unpack it into SIFT1B_ROOT_DIR.
# The target is a directory, so the existence check uses `test -d`
# (`test -f` is never true for a directory).
# NOTE(review): assumes the archive unpacks into $(SIFT1B_GROUNDTRUTH_DIR) - confirm.
$(SIFT1B_GROUNDTRUTH_DIR): | $(SIFT1B_ROOT_DIR).large_dataset_dir
	test -d $@ || curl -fsSL $(SIFT1B_BASE_URL)bigann_gnd.tar.gz | tar -C $(SIFT1B_ROOT_DIR) -zx

# Download one DEEP1B file (ground truth, queries, or a base/learn chunk).
#
# The download API is asked for the path of the target relative to
# DEEP1B_ROOT_DIR; the "href" field of its JSON answer is the real download URL.
#
# - `mkdir -p $(@D)`: chunk files live in base/ and learn/, which this rule's
#   prerequisites do not create, so create the parent directory here (needed
#   under `make -j` or when a chunk is requested directly).
# - The href lookup uses shell command substitution instead of $(shell ...),
#   so the API is contacted only when the file is actually missing.
# - The file is fetched to $@.tmp and renamed on success, so a partial
#   download is never mistaken for a complete one.
$(DEEP1B_GROUNDTRUTH_FILE) $(DEEP1B_QUERY_FILE) $(DEEP1B_BASE_CHUNK_FILES) $(DEEP1B_LEARN_CHUNK_FILES): | $(DEEP1B_ROOT_DIR).large_dataset_dir
	@mkdir -p $(@D)
	test -f $@ || { \
		href="$$(curl -fsSL "$(DEEP1B_API_URL)$(subst $(DEEP1B_ROOT_DIR),,$@)" | sed -e 's/^{\(.*\)}$$/\1/' | tr ',' '\n' | grep href | cut -d ':' -f 2- | tr -d '"')" \
		&& test -n "$$href" \
		&& curl -fsSL "$$href" -o $@.tmp \
		&& mv -f $@.tmp $@; \
	} || { rm -f $@.tmp; exit 1; }

# Assemble the DEEP1B base file by concatenating its chunks in list order.
# The chunks are order-only prerequisites: they must exist, but their
# timestamps do not trigger a rebuild. Output goes to $@.tmp first so an
# interrupted `cat` cannot leave a truncated file that looks up to date.
$(DEEP1B_BASE_FILE): | $(DEEP1B_BASE_DIR).large_dataset_dir $(DEEP1B_BASE_CHUNK_FILES)
	cat $(DEEP1B_BASE_CHUNK_FILES) > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

# Assemble the DEEP1B learn file by concatenating its chunks in list order.
# The chunks are order-only prerequisites: they must exist, but their
# timestamps do not trigger a rebuild. Output goes to $@.tmp first so an
# interrupted `cat` cannot leave a truncated file that looks up to date.
$(DEEP1B_LEARN_FILE): | $(DEEP1B_LEARN_DIR).large_dataset_dir $(DEEP1B_LEARN_CHUNK_FILES)
	cat $(DEEP1B_LEARN_CHUNK_FILES) > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

.PHONY: bench/datasets
## fetch datasets for benchmark
bench/datasets: $(BENCH_DATASETS)
Expand All @@ -45,6 +63,28 @@ bench/datasets/md5dir/print:
bench/datasets/hdf5dir/print:
@echo $(BENCH_DATASET_HDF5_DIR)

# Aggregate target: depends on both large-dataset targets and has no recipe.
.PHONY: bench/datasets/large
## fetch large datasets for benchmark
bench/datasets/large: bench/datasets/large/sift1b bench/datasets/large/deep1b

# Aggregate target: the three SIFT1B vector files plus the ground-truth directory.
.PHONY: bench/datasets/large/sift1b
## fetch sift1b dataset for benchmark
bench/datasets/large/sift1b: $(SIFT1B_BASE_FILE) $(SIFT1B_LEARN_FILE) $(SIFT1B_QUERY_FILE) $(SIFT1B_GROUNDTRUTH_DIR)

# Aggregate target: the DEEP1B base, learn, query and ground-truth files.
.PHONY: bench/datasets/large/deep1b
## fetch deep1b dataset for benchmark
bench/datasets/large/deep1b: $(DEEP1B_BASE_FILE) $(DEEP1B_LEARN_FILE) $(DEEP1B_QUERY_FILE) $(DEEP1B_GROUNDTRUTH_FILE)

.PHONY: bench
## run all benchmarks
bench: \
Expand Down
3 changes: 3 additions & 0 deletions dockers/tools/cli/loadtest/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ COPY pkg/${PKG} .
WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/cmd/${PKG}
COPY cmd/${PKG} .

# Copy the x1b sources from the build context into the same relative path
# inside the repository checkout under GOPATH.
WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/hack/benchmark/assets/x1b
COPY hack/benchmark/assets/x1b .

WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}
COPY versions/GO_VERSION .
COPY versions/VALD_VERSION .
Expand Down
2 changes: 2 additions & 0 deletions hack/benchmark/assets/large/dataset/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore everything in this directory except this .gitignore itself.
*
!.gitignore
197 changes: 197 additions & 0 deletions hack/benchmark/assets/x1b/loader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
//
// Copyright (C) 2019-2020 Vdaas.org Vald team ( kpango, rinx, kmrmt )
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package x1b

import (
"os"
"path/filepath"
"syscall"
"unsafe"

"github.com/vdaas/vald/internal/errors"
)

const (
// headerSize is the size in bytes of the header that precedes the elements
// of every vector record; the header's value is read as an int32 dimension.
headerSize = 4
)

var (
// ErrOutOfBounds is returned when a requested index does not address a
// vector stored in the file.
ErrOutOfBounds = errors.New("out of bounds")
// ErrUnsupportedFileType is returned by Open when the file extension is
// not one of .bvecs, .fvecs or .ivecs.
ErrUnsupportedFileType = errors.New("unsupported file type")
)

// BillionScaleVectors is the element-type-independent view of a vector file
// opened by this package.
type BillionScaleVectors interface {
// Load returns the i-th vector as an interface{} holding the typed slice
// produced by the concrete implementation.
Load(i int) (interface{}, error)
// Dimension returns the number of elements per vector.
Dimension() int
// Size returns the number of vectors in the file.
Size() int
// Close releases the resources held for the file.
Close() error
}

// Uint8Vectors is a BillionScaleVectors whose vectors can be loaded as []uint8.
type Uint8Vectors interface {
BillionScaleVectors
// LoadUint8 returns the i-th vector as a []uint8.
LoadUint8(i int) ([]uint8, error)
}

// FloatVectors is a BillionScaleVectors whose vectors can be loaded as []float32.
type FloatVectors interface {
BillionScaleVectors
// LoadFloat32 returns the i-th vector as a []float32.
LoadFloat32(i int) ([]float32, error)
}

// Int32Vectors is a BillionScaleVectors whose vectors can be loaded as []int32.
type Int32Vectors interface {
BillionScaleVectors
// LoadInt32 returns the i-th vector as a []int32.
LoadInt32(i int) ([]int32, error)
}

// file is a memory-mapped vector file together with its record layout.
type file struct {
// mem is the read-only memory mapping of the whole file.
mem []byte
// dim is the number of elements per vector, read from the first record header.
dim int
// size is the number of whole records in the mapping (len(mem) / block).
size int
// block is the size in bytes of one record: the header plus dim elements.
block int
}

// bvecs is a file whose vector elements are read as uint8.
type bvecs struct {
*file
}
// fvecs is a file whose vector elements are read as float32.
type fvecs struct {
*file
}
// ivecs is a file whose vector elements are read as int32.
type ivecs struct {
*file
}

// open memory-maps fname read-only and derives the record layout from it.
//
// The first headerSize bytes of the file are read as an int32 giving the
// vector dimension; every record is then taken to be headerSize bytes of
// header followed by dim elements of elementSize bytes.
//
// The file descriptor is closed before returning; the mapping stays valid
// until the returned file's Close is called. An error is returned when the
// file cannot be opened, stat'ed or mapped, when it is too short to contain a
// header, or when the stored dimension is not positive.
func open(fname string, elementSize int) (f *file, err error) {
	fp, err := os.Open(fname)
	if err != nil {
		return nil, err
	}
	defer func() {
		if e := fp.Close(); e != nil {
			err = errors.Wrap(err, e.Error())
		}
	}()

	fi, err := fp.Stat()
	if err != nil {
		return nil, err
	}

	// Reading the dimension below dereferences the first headerSize bytes of
	// the mapping, so a shorter file must be rejected up front.
	if fi.Size() < headerSize {
		return nil, errors.New("file is too small to contain a vector header")
	}

	mem, err := syscall.Mmap(int(fp.Fd()), 0, int(fi.Size()), syscall.PROT_READ, syscall.MAP_SHARED)
	if err != nil {
		return nil, err
	}

	dim := int(*(*int32)(unsafe.Pointer(&mem[0])))
	// A non-positive dimension would make the block size zero or negative,
	// leading to a division by zero or a negative record count below.
	if dim <= 0 {
		// Do not leak the mapping when the file is rejected.
		_ = syscall.Munmap(mem)
		return nil, errors.New("invalid vector dimension")
	}

	block := headerSize + dim*elementSize
	return &file{
		mem:   mem,
		dim:   dim,
		size:  len(mem) / block,
		block: block,
	}, nil
}

// Close unmaps the file's memory mapping and reports any error from doing so.
// Slices previously returned by the Load methods point into that mapping.
func (f *file) Close() error {
	if err := syscall.Munmap(f.mem); err != nil {
		return err
	}
	return nil
}

// load returns the raw element bytes of the i-th record, i.e. the record
// without its leading header. The returned slice aliases the memory mapping.
//
// It returns ErrOutOfBounds when i is negative or not less than the number of
// records; a negative index previously fell through and panicked on slicing.
func (f *file) load(i int) ([]byte, error) {
	if i < 0 || i >= f.size {
		return nil, ErrOutOfBounds
	}

	start := i*f.block + headerSize
	end := (i + 1) * f.block
	return f.mem[start:end], nil
}

// Dimension reports the number of elements stored per vector.
func (f *file) Dimension() int {
	dimension := f.dim
	return dimension
}

// Size reports the number of vectors held by the file.
func (f *file) Size() int {
	count := f.size
	return count
}

// LoadUint8 returns the i-th vector as a []uint8 of length and capacity dim.
// The slice aliases the memory mapping and must not be used after Close.
// It returns the error from load (ErrOutOfBounds) for an invalid index.
func (bv *bvecs) LoadUint8(i int) ([]uint8, error) {
	buf, err := bv.load(i)
	if err != nil {
		return nil, err
	}
	// []byte and []uint8 are the same type, so the record bytes can be
	// re-sliced directly; no unsafe cast is needed, and an empty record no
	// longer panics on &buf[0].
	return buf[:bv.dim:bv.dim], nil
}

// Load returns the i-th vector as an interface{} wrapping the []uint8
// produced by LoadUint8, together with that call's error.
func (bv *bvecs) Load(i int) (interface{}, error) {
	vec, err := bv.LoadUint8(i)
	return vec, err
}

// LoadFloat32 returns the i-th vector as a []float32 of length and capacity
// dim. The slice reinterprets the mapped bytes in place (no copy) and must
// not be used after Close. It returns the error from load (ErrOutOfBounds)
// for an invalid index.
func (fv *fvecs) LoadFloat32(i int) ([]float32, error) {
	buf, err := fv.load(i)
	if err != nil {
		return nil, err
	}
	if len(buf) == 0 {
		// A record without element bytes has no first element to take the
		// address of; &buf[0] would panic.
		return []float32{}, nil
	}
	return ((*[1 << 26]float32)(unsafe.Pointer(&buf[0])))[:fv.dim:fv.dim], nil
}

// Load returns the i-th vector as an interface{} wrapping the []float32
// produced by LoadFloat32, together with that call's error.
func (fv *fvecs) Load(i int) (interface{}, error) {
	vec, err := fv.LoadFloat32(i)
	return vec, err
}

// LoadInt32 returns the i-th vector as a []int32 of length and capacity dim.
// The slice reinterprets the mapped bytes in place (no copy) and must not be
// used after Close. It returns the error from load (ErrOutOfBounds) for an
// invalid index.
func (iv *ivecs) LoadInt32(i int) ([]int32, error) {
	buf, err := iv.load(i)
	if err != nil {
		return nil, err
	}
	if len(buf) == 0 {
		// A record without element bytes has no first element to take the
		// address of; &buf[0] would panic.
		return []int32{}, nil
	}
	return ((*[1 << 26]int32)(unsafe.Pointer(&buf[0])))[:iv.dim:iv.dim], nil
}

// Load returns the i-th vector as an interface{} wrapping the []int32
// produced by LoadInt32, together with that call's error.
func (iv *ivecs) Load(i int) (interface{}, error) {
	vec, err := iv.LoadInt32(i)
	return vec, err
}

// NewUint8Vectors opens fname as a vector file with 1-byte elements and
// returns it as a Uint8Vectors. Any error from opening the file is returned.
func NewUint8Vectors(fname string) (Uint8Vectors, error) {
	const elementSize = 1 // bytes per uint8 element

	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &bvecs{file: mapped}, nil
}

// NewFloatVectors opens fname as a vector file with 4-byte elements and
// returns it as a FloatVectors. Any error from opening the file is returned.
func NewFloatVectors(fname string) (FloatVectors, error) {
	const elementSize = 4 // bytes per float32 element

	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &fvecs{file: mapped}, nil
}

// NewInt32Vectors opens fname as a vector file with 4-byte elements and
// returns it as an Int32Vectors. Any error from opening the file is returned.
func NewInt32Vectors(fname string) (Int32Vectors, error) {
	const elementSize = 4 // bytes per int32 element

	mapped, err := open(fname, elementSize)
	if err != nil {
		return nil, err
	}
	return &ivecs{file: mapped}, nil
}

// Open opens fname with the loader selected by its file extension:
// ".bvecs" -> NewUint8Vectors, ".fvecs" -> NewFloatVectors,
// ".ivecs" -> NewInt32Vectors. Any other extension yields
// ErrUnsupportedFileType.
func Open(fname string) (BillionScaleVectors, error) {
	ext := filepath.Ext(fname)
	if ext == ".bvecs" {
		return NewUint8Vectors(fname)
	}
	if ext == ".fvecs" {
		return NewFloatVectors(fname)
	}
	if ext == ".ivecs" {
		return NewInt32Vectors(fname)
	}
	return nil, ErrUnsupportedFileType
}
Loading