test updates, continuing cleanup
ifadams committed Aug 22, 2024
1 parent 374a3ec commit 82a979f
Showing 2 changed files with 130 additions and 37 deletions.
81 changes: 44 additions & 37 deletions src/DescriptorsCommand.cc
@@ -352,16 +352,6 @@ long AddDescriptor::insert_descriptor(const std::string &blob,

VCL::DescriptorSet *desc_set = _dm->get_descriptors_handler(set_path);

//TODO this check no longer applies, should move it elsewhere
/*if (blob.length() / 4 != dim) {
std::cerr << "AddDescriptor::insert_descriptor: ";
std::cerr << "Dimensions mismatch: ";
std::cerr << blob.length() / 4 << " " << dim << std::endl;
error["info"] = "Blob Dimensions Mismatch";
return -1;
}*/


if (!label.empty()) {
long label_id = desc_set->get_label_id(label);
long *label_ptr = &label_id;
@@ -449,7 +439,6 @@ int AddDescriptor::add_single_descriptor(PMGDQuery &query,
retrieve_aws_descriptorSet(set_path);
}

//TODO modify insert descriptor to handle batches
long id = insert_descriptor(blob, set_path, 1, label, error);

if (id < 0) {
Expand Down Expand Up @@ -517,6 +506,7 @@ int AddDescriptor::add_descriptor_batch(PMGDQuery &query,
const std::string &blob, int grp_id,
Json::Value &error){

const int FOUR_BYTE_INT = 4;
int expected_blb_size;
int nr_expected_descs;
int dimensions;
@@ -525,7 +515,7 @@ int AddDescriptor::add_descriptor_batch(PMGDQuery &query,
const Json::Value &cmd = jsoncmd[_cmd_name];
const std::string set_name = cmd["set"].asString();

Json::Value props = get_value<Json::Value>(cmd, "properties");
//Json::Value props = get_value<Json::Value>(cmd, "properties");

araghuna1 commented on Aug 29, 2024:

It would be useful for a client to get back at least a warning if both properties and batch_properties are passed (instead of silently ignoring properties). But I'm not sure how much of a hassle it is to send back such a warning in the returned results, so I will defer to your judgement on this.

ifadams (Author, Contributor) replied on Aug 29, 2024:

It'd be kind of a mess; I could theoretically just fail the transaction with a "Hey, don't use both" error. Just returning a warning may be a bit awkward. We may want to open an issue to keep track of this.
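
Following up on the thread above, a minimal sketch of such a guard (hypothetical, not part of this commit; it reuses only the jsoncpp calls and the error convention already present in this function) could simply reject requests that pass both keys:

if (cmd.isMember("properties") && cmd.isMember("batch_properties")) {
  // Hypothetical check: fail the transaction instead of silently ignoring "properties"
  error["info"] = "Use either 'properties' or 'batch_properties', not both";
  return -1;
}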


//extract properties list and get filepath/object location of set
Json::Value prop_list = get_value<Json::Value>(cmd, "batch_properties");
@@ -538,7 +528,7 @@ int AddDescriptor::add_descriptor_batch(PMGDQuery &query,
}

std::string label = get_value<std::string>(cmd, "label", "None");
props[VDMS_DESC_LABEL_PROP] = label;
//props[VDMS_DESC_LABEL_PROP] = label;

// retrieve the descriptor set from AWS here
// operations are currently done in memory with no subsequent write to disk
@@ -550,18 +540,17 @@ int AddDescriptor::add_descriptor_batch(PMGDQuery &query,
// Note each dimension is a 4-byte (32-bit) value, hence the /4 math on size,
// as the string blob is sized in 8-bit chars.
nr_expected_descs = prop_list.size();
expected_blb_size = nr_expected_descs * dimensions * 4;
expected_blb_size = nr_expected_descs * dimensions * FOUR_BYTE_INT;

//Verify length of input matches expectations
if (blob.length() != expected_blb_size) {
std::cerr << "AddDescriptor::insert_descriptor: ";
std::cerr << "Expectected Blob Length Does Not Match Input ";
std::cerr << blob.length() << " != " << expected_blb_size << std::endl;
std::cerr << "Expected Blob Length Does Not Match Input ";
std::cerr << "Input Length: " <<blob.length() << " != " << "Expected Length: " expected_blb_size << std::endl;
error["info"] = "FV Input Length Mismatch";
return -1;
}

//TODO modify insert descriptor to handle batches
long id = insert_descriptor(blob, set_path, nr_expected_descs, label, error);

if (id < 0) {
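
For example, with a 128-dimensional set and a batch of five descriptors (as in the new test below), the expected blob length is 5 * 128 * 4 = 2560 bytes of float32 data; a blob of any other length is rejected with "FV Input Length Mismatch".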
@@ -572,44 +561,62 @@ int AddDescriptor::add_descriptor_batch(PMGDQuery &query,
std::uintmax_t n = fs::remove_all(set_path);
std::cout << "Deleted " << n << " files or directories\n";
}

error["info"] = "FV Index Insert Failed";
return -1;
}

//get reference tag for source node for ID
// Loop over properties list, add relevant query, link, and edges for each
// It passed the checker, so it exists.
int set_ref = query.get_available_reference();

Json::Value link;
Json::Value results;
Json::Value list_arr;
list_arr.append(VDMS_DESC_SET_PATH_PROP);
list_arr.append(VDMS_DESC_SET_DIM_PROP);
results["list"] = list_arr;

//constraints for getting set node to link to.
Json::Value constraints;
Json::Value name_arr;
name_arr.append("==");
name_arr.append(set_name);
constraints[VDMS_DESC_SET_NAME_PROP] = name_arr;
bool unique = true;

// Query the set node - we only need to do this once, outside of the loop
query.QueryNode(set_ref, VDMS_DESC_SET_TAG, link, constraints, results,
unique);

for(int i=0; i < nr_expected_descs; i++) {
int node_ref = query.get_available_reference();
Json::Value cur_props;
cur_props = prop_list[i];
//TODO Note using iterator to modify ID return, we're gonna want to watch this closely.
cur_props[VDMS_DESC_ID_PROP] = Json::Int64(id+i);
cur_props[VDMS_DESC_LABEL_PROP] = label;

query.AddNode(node_ref, VDMS_DESC_TAG, cur_props, Json::nullValue);

// It passed the checker, so it exists.
int set_ref = query.get_available_reference();

Json::Value link;
Json::Value results;
Json::Value list_arr;
list_arr.append(VDMS_DESC_SET_PATH_PROP);
list_arr.append(VDMS_DESC_SET_DIM_PROP);
results["list"] = list_arr;
//int set_ref = query.get_available_reference();
//Json::Value link;
//Json::Value results;
//Json::Value list_arr;
//list_arr.append(VDMS_DESC_SET_PATH_PROP);
//list_arr.append(VDMS_DESC_SET_DIM_PROP);
//results["list"] = list_arr;

//constraints for getting set node to link to.
Json::Value constraints;
Json::Value name_arr;
name_arr.append("==");
name_arr.append(set_name);
constraints[VDMS_DESC_SET_NAME_PROP] = name_arr;
//Json::Value constraints;
//Json::Value name_arr;
//name_arr.append("==");
//name_arr.append(set_name);
//constraints[VDMS_DESC_SET_NAME_PROP] = name_arr;

bool unique = true;
//bool unique = true;

// Query set node-We only need to do this once, outside of the loop TODO MOVE
query.QueryNode(set_ref, VDMS_DESC_SET_TAG, link, constraints, results,
unique);
//query.QueryNode(set_ref, VDMS_DESC_SET_TAG, link, constraints, results,
// unique);

//note this implicitly means that every node of a batch uses the same link
if (cmd.isMember("link")) {
86 changes: 86 additions & 0 deletions tests/python/TestDescriptors.py
@@ -316,6 +316,92 @@ def test_AddSetAndInsertBatch(self):

self.disconnect(db)

def test_AddBatchAndFindKNN(self):

db = self.create_connection()

# Create and verify descriptor set
trans_list = []
trans_dict = {}
desc_set = {}
desc_set["engine"] = "FaissFlat"
desc_set["metric"] = "L2"
desc_set["name"] = "knn_batch_set"
desc_set["dimensions"] = 128
trans_dict["AddDescriptorSet"] = desc_set

trans_list.append(trans_dict)

response, img_array = db.query(trans_list)
self.assertEqual(response[0]["AddDescriptorSet"]["status"],0)

# Descriptor set created; now let's create a batch to insert.
# First, let's make one big combined blob representing the inserted descriptors.
trans = []
blobs = []
nr_dims = 128
batch_size = 5
desc_blob = []
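# Pack the whole batch into a single float32 blob: each of the 5 descriptors
# is all ones except index 2, which is set to 2.34 + 20*i so every descriptor
# in the batch is distinct (total blob size: 5 * 128 * 4 = 2560 bytes).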
x = np.ones(nr_dims * batch_size)
for i in range(batch_size):
x[2 + (i*nr_dims)] = 2.34 + i * 20

x = x.astype("float32")
desc_blob.append(x.tobytes())

properties_list = []
for x in range(batch_size):
props = {"myid": x + 200}
properties_list.append(props)

descriptor = {}
descriptor["set"] = "knn_batch_set"
descriptor["batch_properties"] = properties_list

query = {}
query["AddDescriptor"] = descriptor
trans.append(query)
blobs.append(desc_blob)

response, img_array = db.query(trans, blobs)
self.assertEqual(response[0]["AddDescriptor"]["status"], 0)

### Now try to find a KNN
kn = 3
finddescriptor = {}
finddescriptor["set"] = "knn_batch_set"

results = {}
results["list"] = ["myid", "_id", "_distance"]
results["blob"] = True
finddescriptor["results"] = results
finddescriptor["k_neighbors"] = kn

query = {}
query["FindDescriptor"] = finddescriptor

all_queries = []
all_queries.append(query)

descriptor_blob = []
x = np.ones(128)
x[2] = x[2] = 2.34 + 1 * 20 # 2.34 + 1*20

araghuna1 commented on Aug 29, 2024:

Is this a typo? x[2] = x[2] =

ifadams (Author, Contributor) replied on Aug 29, 2024:

Appears to be, but one that has no effect. I'll tweak it.

x = x.astype("float32")
descriptor_blob.append(x.tobytes())

response, blob_array = db.query(all_queries, [descriptor_blob])

self.assertEqual(len(blob_array), kn)
self.assertEqual(descriptor_blob[0], blob_array[0])

# Check success
self.assertEqual(response[0]["FindDescriptor"]["status"], 0)
self.assertEqual(response[0]["FindDescriptor"]["returned"], kn)
self.assertEqual(response[0]["FindDescriptor"]["entities"][0]["_distance"], 0)
self.assertEqual(response[0]["FindDescriptor"]["entities"][1]["_distance"], 400)
self.assertEqual(response[0]["FindDescriptor"]["entities"][2]["_distance"], 400)
self.disconnect(db)


def test_classifyDescriptor(self):
db = self.create_connection()
