From 6cd35d6c0c712147d0ffab2e7b79fb5fa0e6d12f Mon Sep 17 00:00:00 2001 From: masajiro Date: Wed, 26 Jun 2019 10:59:37 +0900 Subject: [PATCH] v1.7.6 add jaccard distance --- README.md | 1 + VERSION | 2 +- lib/NGT/Capi.cpp | 12 ++++ lib/NGT/Capi.h | 2 + lib/NGT/Clustering.h | 2 +- lib/NGT/Command.cpp | 5 +- lib/NGT/Graph.cpp | 6 ++ lib/NGT/Graph.h | 2 + lib/NGT/Index.h | 3 + lib/NGT/ObjectSpace.h | 3 +- lib/NGT/ObjectSpaceRepository.h | 24 ++++++++ lib/NGT/PrimitiveComparator.h | 103 ++++++++++++++++++++++---------- python/README-ngtpy-jp.md | 1 + python/README-ngtpy.md | 1 + python/ngt/README.md | 2 +- python/ngt/base.py | 6 ++ python/setup.py | 4 +- python/src/ngtpy.cpp | 68 +++++++++++++++++++++ 18 files changed, 209 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 3f49f46..67d174b 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data **NGT** provides commands and a library for performing high-speed approximate nearest neighbor searches against a large volume of data (several million to several 10 million items of data) in high dimensional vector data space (several ten to several thousand dimensions). News +- 06/26/2019 Jaccard distance is available. (v1.7.6) - 06/10/2019 PyPI NGT package v1.7.5 is now available. - 01/17/2019 Python NGT can be installed via pip from PyPI. (v1.5.1) - 12/14/2018 [NGTQ](bin/ngtq/README.md) (NGT with Quantization) is now available. (v1.5.0) diff --git a/VERSION b/VERSION index 6a126f4..de28578 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.7.5 +1.7.6 diff --git a/lib/NGT/Capi.cpp b/lib/NGT/Capi.cpp index b5b7928..df27bb2 100644 --- a/lib/NGT/Capi.cpp +++ b/lib/NGT/Capi.cpp @@ -242,6 +242,18 @@ bool ngt_set_property_distance_type_hamming(NGTProperty prop, NGTError error) { return true; } +bool ngt_set_property_distance_type_jaccard(NGTProperty prop, NGTError error) { + if(prop == NULL){ + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : parametor error: prop = " << prop; + operate_error_string_(ss, error); + return false; + } + + (*static_cast(prop)).distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard; + return true; +} + bool ngt_set_property_distance_type_cosine(NGTProperty prop, NGTError error) { if(prop == NULL){ std::stringstream ss; diff --git a/lib/NGT/Capi.h b/lib/NGT/Capi.h index 2599389..1c9d8ff 100644 --- a/lib/NGT/Capi.h +++ b/lib/NGT/Capi.h @@ -72,6 +72,8 @@ bool ngt_set_property_distance_type_angle(NGTProperty, NGTError); bool ngt_set_property_distance_type_hamming(NGTProperty, NGTError); +bool ngt_set_property_distance_type_jaccard(NGTProperty, NGTError); + bool ngt_set_property_distance_type_cosine(NGTProperty, NGTError); bool ngt_set_property_distance_type_normalized_angle(NGTProperty, NGTError); diff --git a/lib/NGT/Clustering.h b/lib/NGT/Clustering.h index 60390bb..35e2d2e 100644 --- a/lib/NGT/Clustering.h +++ b/lib/NGT/Clustering.h @@ -188,7 +188,7 @@ namespace NGT { double csum = 0.0; float *x = a; float *y = b; - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { double d = (double)*x++ - (double)*y++; csum += d * d; } diff --git a/lib/NGT/Command.cpp b/lib/NGT/Command.cpp index 9f91715..c7d2522 100644 --- a/lib/NGT/Command.cpp +++ b/lib/NGT/Command.cpp @@ -26,7 +26,7 @@ const string usage = "Usage: ngt create " "-d dimension [-p #-of-thread] [-i index-type(t|g)] [-g graph-type(a|k|b|o|i)] " "[-t truncation-edge-limit] [-E edge-size] [-S edge-size-for-search] [-L edge-size-limit] " - "[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|c|C)] [-n #-of-inserted-objects] " + "[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|j|c|C)] [-n #-of-inserted-objects] " "[-P path-adjustment-interval] [-B dynamic-edge-size-base] [-A object-alignment(t|f)] " "[-T build-time-limit] [-O outgoing x incoming] " "index(output) [data.tsv(input)]"; @@ -153,6 +153,9 @@ case 'h': property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeHamming; break; + case 'j': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard; + break; case 'c': property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeCosine; break; diff --git a/lib/NGT/Graph.cpp b/lib/NGT/Graph.cpp index abd4067..02f1148 100644 --- a/lib/NGT/Graph.cpp +++ b/lib/NGT/Graph.cpp @@ -119,6 +119,12 @@ NeighborhoodGraph::Search::hammingUint8(NeighborhoodGraph &graph, NGT::SearchCon { graph.searchReadOnlyGraph(sc, seeds); } + +void +NeighborhoodGraph::Search::jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds) +{ + graph.searchReadOnlyGraph(sc, seeds); +} #endif void diff --git a/lib/NGT/Graph.h b/lib/NGT/Graph.h index 4fe0603..c7cc266 100644 --- a/lib/NGT/Graph.h +++ b/lib/NGT/Graph.h @@ -277,6 +277,7 @@ namespace NGT { case NGT::ObjectSpace::Uint8: switch (dtype) { case NGT::ObjectSpace::DistanceTypeHamming : return hammingUint8; + case NGT::ObjectSpace::DistanceTypeJaccard : return jaccardUint8; case NGT::ObjectSpace::DistanceTypeL2 : return l2Uint8; case NGT::ObjectSpace::DistanceTypeL1 : return l1Uint8; default : return l2Uint8; @@ -290,6 +291,7 @@ namespace NGT { static void l1Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void l2Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void hammingUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); + static void jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void cosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void angleFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void normalizedCosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); diff --git a/lib/NGT/Index.h b/lib/NGT/Index.h index 6c831a0..6638a1d 100644 --- a/lib/NGT/Index.h +++ b/lib/NGT/Index.h @@ -112,6 +112,7 @@ namespace NGT { case DistanceType::DistanceTypeL1: p.set("DistanceType", "L1"); break; case DistanceType::DistanceTypeL2: p.set("DistanceType", "L2"); break; case DistanceType::DistanceTypeHamming: p.set("DistanceType", "Hamming"); break; + case DistanceType::DistanceTypeJaccard: p.set("DistanceType", "Jaccard"); break; case DistanceType::DistanceTypeAngle: p.set("DistanceType", "Angle"); break; case DistanceType::DistanceTypeCosine: p.set("DistanceType", "Cosine"); break; case DistanceType::DistanceTypeNormalizedAngle: p.set("DistanceType", "NormalizedAngle"); break; @@ -169,6 +170,8 @@ namespace NGT { distanceType = DistanceType::DistanceTypeL2; } else if (it->second == "Hamming") { distanceType = DistanceType::DistanceTypeHamming; + } else if (it->second == "Jaccard") { + distanceType = DistanceType::DistanceTypeJaccard; } else if (it->second == "Angle") { distanceType = DistanceType::DistanceTypeAngle; } else if (it->second == "Cosine") { diff --git a/lib/NGT/ObjectSpace.h b/lib/NGT/ObjectSpace.h index 9cb9f27..6157e31 100644 --- a/lib/NGT/ObjectSpace.h +++ b/lib/NGT/ObjectSpace.h @@ -171,7 +171,8 @@ namespace NGT { DistanceTypeAngle = 3, DistanceTypeCosine = 4, DistanceTypeNormalizedAngle = 5, - DistanceTypeNormalizedCosine = 6 + DistanceTypeNormalizedCosine = 6, + DistanceTypeJaccard = 7 }; enum ObjectType { diff --git a/lib/NGT/ObjectSpaceRepository.h b/lib/NGT/ObjectSpaceRepository.h index 278184f..ee1de02 100644 --- a/lib/NGT/ObjectSpaceRepository.h +++ b/lib/NGT/ObjectSpaceRepository.h @@ -93,6 +93,27 @@ namespace NGT { #endif }; + class ComparatorJaccardDistance : public Comparator { + public: +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + ComparatorJaccardDistance(size_t d, SharedMemoryAllocator &a) : Comparator(d, a) {} + double operator()(Object &objecta, Object &objectb) { + return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } + double operator()(Object &objecta, PersistentObject &objectb) { + return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } + double operator()(PersistentObject &objecta, PersistentObject &objectb) { + return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta.at(0, allocator), (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } +#else + ComparatorJaccardDistance(size_t d) : Comparator(d) {} + double operator()(Object &objecta, Object &objectb) { + return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } +#endif + }; + class ComparatorAngleDistance : public Comparator { public: #ifdef NGT_SHARED_MEMORY_ALLOCATOR @@ -278,6 +299,9 @@ namespace NGT { case DistanceTypeHamming: comparator = new ObjectSpaceRepository::ComparatorHammingDistance(ObjectSpace::getPaddedDimension()); break; + case DistanceTypeJaccard: + comparator = new ObjectSpaceRepository::ComparatorJaccardDistance(ObjectSpace::getPaddedDimension()); + break; case DistanceTypeAngle: comparator = new ObjectSpaceRepository::ComparatorAngleDistance(ObjectSpace::getPaddedDimension()); break; diff --git a/lib/NGT/PrimitiveComparator.h b/lib/NGT/PrimitiveComparator.h index d8ceacf..13b12a3 100644 --- a/lib/NGT/PrimitiveComparator.h +++ b/lib/NGT/PrimitiveComparator.h @@ -313,7 +313,6 @@ namespace NGT { } #endif - #if defined(NGT_COMPARATOR_NO_AVX) || !defined(__POPCNT__) inline static double popCount(uint32_t x) { x = (x & 0x55555555) + (x >> 1 & 0x55555555); @@ -326,41 +325,76 @@ namespace NGT { template inline static double compareHammingDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { - const OBJECT_TYPE *last = a + size; + const uint32_t *last = reinterpret_cast(a + size); - OBJECT_TYPE *uinta = (OBJECT_TYPE*)a; - OBJECT_TYPE *uintb = (OBJECT_TYPE*)b; + const uint32_t *uinta = reinterpret_cast(a); + const uint32_t *uintb = reinterpret_cast(b); size_t count = 0; - while( uinta < (OBJECT_TYPE*)last ){ - count += popCount(*(uint32_t*)uinta ^ *(uint32_t*)uintb); - uinta += 4; - uintb += 4; + while( uinta < last ){ + count += popCount(*uinta++ ^ *uintb++); } - return (double)count; + return static_cast(count); } #else template inline static double compareHammingDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { - const OBJECT_TYPE *last = a + size; + const uint64_t *last = reinterpret_cast(a + size); - uint64_t *uinta = (uint64_t*)a; - uint64_t *uintb = (uint64_t*)b; + const uint64_t *uinta = reinterpret_cast(a); + const uint64_t *uintb = reinterpret_cast(b); size_t count = 0; - while( uinta < (uint64_t*)last ){ + while( uinta < last ){ count += _mm_popcnt_u64(*uinta++ ^ *uintb++); count += _mm_popcnt_u64(*uinta++ ^ *uintb++); } - return (double)count; + return static_cast(count); + } +#endif + +#if defined(NGT_COMPARATOR_NO_AVX) || !defined(__POPCNT__) + template + inline static double compareJaccardDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { + const uint32_t *last = reinterpret_cast(a + size); + + const uint32_t *uinta = reinterpret_cast(a); + const uint32_t *uintb = reinterpret_cast(b); + size_t count = 0; + size_t countDe = 0; + while( uinta < last ){ + count += popCount(*uinta & *uintb); + countDe += popCount(*uinta++ | *uintb++); + count += popCount(*uinta & *uintb); + countDe += popCount(*uinta++ | *uintb++); + } + + return 1.0 - static_cast(count) / static_cast(countDe); + } +#else + template + inline static double compareJaccardDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { + const uint64_t *last = reinterpret_cast(a + size); + + const uint64_t *uinta = reinterpret_cast(a); + const uint64_t *uintb = reinterpret_cast(b); + size_t count = 0; + size_t countDe = 0; + while( uinta < last ){ + count += _mm_popcnt_u64(*uinta & *uintb); + countDe += _mm_popcnt_u64(*uinta++ | *uintb++); + count += _mm_popcnt_u64(*uinta & *uintb); + countDe += _mm_popcnt_u64(*uinta++ | *uintb++); + } + + return 1.0 - static_cast(count) / static_cast(countDe); } #endif - #if defined(NGT_COMPARATOR_NO_AVX) template inline static double compareDotProduct(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { - double sum = 0.0F; + double sum = 0.0; for (size_t loc = 0; loc < size; loc++) { sum += (double)a[loc] * (double)b[loc]; } @@ -369,9 +403,9 @@ namespace NGT { template inline static double compareCosine(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { - double normA = 0.0F; - double normB = 0.0F; - double sum = 0.0F; + double normA = 0.0; + double normB = 0.0; + double sum = 0.0; for (size_t loc = 0; loc < size; loc++) { normA += (double)a[loc] * (double)a[loc]; normB += (double)b[loc] * (double)b[loc]; @@ -432,7 +466,7 @@ namespace NGT { } inline static double compareDotProduct(const unsigned char *a, const unsigned char *b, size_t size) { - double sum = 0.0F; + double sum = 0.0; for (size_t loc = 0; loc < size; loc++) { sum += (double)a[loc] * (double)b[loc]; } @@ -479,9 +513,9 @@ namespace NGT { } inline static double compareCosine(const unsigned char *a, const unsigned char *b, size_t size) { - double normA = 0.0F; - double normB = 0.0F; - double sum = 0.0F; + double normA = 0.0; + double normB = 0.0; + double sum = 0.0; for (size_t loc = 0; loc < size; loc++) { normA += (double)a[loc] * (double)a[loc]; normB += (double)b[loc] * (double)b[loc]; @@ -497,10 +531,10 @@ namespace NGT { template inline static double compareAngleDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { double cosine = compareCosine(a, b, size); - if (cosine >= 1.0F) { - return 0.0F; - } else if (cosine <= -1.0F) { - return acos(-1.0F); + if (cosine >= 1.0) { + return 0.0; + } else if (cosine <= -1.0) { + return acos(-1.0); } else { return acos(cosine); } @@ -509,10 +543,10 @@ namespace NGT { template inline static double compareNormalizedAngleDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { double cosine = compareDotProduct(a, b, size); - if (cosine >= 1.0F) { - return 0.0F; - } else if (cosine <= -1.0F) { - return acos(-1.0F); + if (cosine >= 1.0) { + return 0.0; + } else if (cosine <= -1.0) { + return acos(-1.0); } else { return acos(cosine); } @@ -550,6 +584,13 @@ namespace NGT { } }; + class JaccardUint8 { + public: + inline static double compare(const void *a, const void *b, size_t size) { + return PrimitiveComparator::compareJaccardDistance((const uint8_t*)a, (const uint8_t*)b, size); + } + }; + class L2Float { public: inline static double compare(const void *a, const void *b, size_t size) { diff --git a/python/README-ngtpy-jp.md b/python/README-ngtpy-jp.md index 40e1de8..4b9784f 100644 --- a/python/README-ngtpy-jp.md +++ b/python/README-ngtpy-jp.md @@ -152,6 +152,7 @@ FUNCTIONS - __Cosine__: コサイン類似度 - __Normalized Cosine__: 正規化コサイン類似度。指定されたデータは自動的に正規化された上でインデックスに登録されます。 - __Hamming__: ハミング距離 +- __Jaccard__: ジャッカード距離 **object\_type** オブジェクトのデータタイプを指定します。 diff --git a/python/README-ngtpy.md b/python/README-ngtpy.md index e595abd..70cd61a 100644 --- a/python/README-ngtpy.md +++ b/python/README-ngtpy.md @@ -154,6 +154,7 @@ Specifies the distance function for the objects. - __Cosine__: Cosine similarity - __Normalized Cosine__: Normalized cosine similarity. The specified data are automatically normalized to be appended to the index. - __Hamming__: Hamming distance +- __Jaccard__: Jaccard distance **object\_type** Specifies the data type of the objects. diff --git a/python/ngt/README.md b/python/ngt/README.md index d22bbf9..0a69fcb 100644 --- a/python/ngt/README.md +++ b/python/ngt/README.md @@ -124,7 +124,7 @@ create an empty index with the specified parameters. edge_size_for_creation : Number of edges for each node in the graph. edge_size_for_search : Number of edges to search. object_type : Type of the data object. (Float, Integer [Integer is 1 byte]) - distance_type : Type of the distance function. (L1,L2,Angle,Hamming) + distance_type : Type of the distance function. (L1,L2,Angle,Hamming,Jaccard)

save

diff --git a/python/ngt/base.py b/python/ngt/base.py index 9d68757..462af79 100644 --- a/python/ngt/base.py +++ b/python/ngt/base.py @@ -128,6 +128,8 @@ def __repr__(self): __ngt.ngt_set_property_distance_type_hamming.argtypes = [c_void_p, c_void_p] + __ngt.ngt_set_property_distance_type_jaccard.argtypes = [c_void_p, c_void_p] + __ngt.ngt_set_property_distance_type_cosine.argtypes = [c_void_p, c_void_p] __ngt.ngt_create_empty_results.argtype = [c_void_p] @@ -249,6 +251,10 @@ def create(path, dimension, stat = Index.__ngt.ngt_set_property_distance_type_hamming( prop, err) Index._check_error_num(stat, err) + elif distance_type == "Jaccard": + stat = Index.__ngt.ngt_set_property_distance_type_jaccard( + prop, err) + Index._check_error_num(stat, err) elif distance_type == "Cosine": stat = Index.__ngt.ngt_set_property_distance_type_cosine( prop, err) diff --git a/python/setup.py b/python/setup.py index 72e8cbd..f36c645 100644 --- a/python/setup.py +++ b/python/setup.py @@ -21,7 +21,7 @@ # for pip >= 10.0 from pip._internal import locations -version = '1.3.1' +version = '1.4.0' if static_library: with open('../VERSION', 'r') as fh: @@ -43,7 +43,7 @@ 'long_description_content_type': 'text/markdown', 'license': 'Apache License Version 2.0', 'packages': ['ngt'], - 'install_requires': ['numpy'] + 'install_requires': ['numpy', 'pybind11'] } if sys.version_info.major >= 3: diff --git a/python/src/ngtpy.cpp b/python/src/ngtpy.cpp index bdd263a..ea79184 100644 --- a/python/src/ngtpy.cpp +++ b/python/src/ngtpy.cpp @@ -64,6 +64,8 @@ class Index : public NGT::Index { prop.distanceType = NGT::Property::DistanceType::DistanceTypeL2; } else if (distanceType == "Hamming") { prop.distanceType = NGT::Property::DistanceType::DistanceTypeHamming; + } else if (distanceType == "Jaccard") { + prop.distanceType = NGT::Property::DistanceType::DistanceTypeJaccard; } else if (distanceType == "Angle") { prop.distanceType = NGT::Property::DistanceType::DistanceTypeAngle; } else if (distanceType == "Normalized Angle") { @@ -184,6 +186,68 @@ class Index : public NGT::Index { return results; } + py::object linearSearch( + py::object query, + size_t size = 10, // the number of resultant objects + bool withDistance = true + ) { + py::array_t qobject(query); + py::buffer_info qinfo = qobject.request(); + NGT::Object *ngtquery = 0; + try { + ngtquery = NGT::Index::allocateObject(static_cast(qinfo.ptr), qinfo.size); + } catch (NGT::Exception &e) { + std::cerr << e.what() << endl; + if (!withDistance) { + return py::array_t(); + } else { + return py::list(); + } + } + + NGT::SearchContainer sc(*ngtquery); + sc.setSize(size); // the number of resultant objects. + + NGT::Index::linearSearch(sc); + + numOfDistanceComputations += sc.distanceComputationCount; + + NGT::Index::deleteObject(ngtquery); + if (!withDistance) { + NGT::ResultPriorityQueue &r = sc.getWorkingResult(); + py::array_t ids(r.size()); + py::buffer_info idsinfo = ids.request(); + int *endptr = reinterpret_cast(idsinfo.ptr); + int *ptr = endptr + (r.size() - 1); + if (zeroNumbering) { + while (ptr >= endptr) { + *ptr-- = r.top().id - 1; + r.pop(); + } + } else { + while (ptr >= endptr) { + *ptr-- = r.top().id; + r.pop(); + } + } + + return ids; + } + py::list results; + NGT::ObjectDistances r; + r.moveFrom(sc.getWorkingResult()); + if (zeroNumbering) { + for (auto ri = r.begin(); ri != r.end(); ++ri) { + results.append(py::make_tuple((*ri).id - 1, (*ri).distance)); + } + } else { + for (auto ri = r.begin(); ri != r.end(); ++ri) { + results.append(py::make_tuple((*ri).id, (*ri).distance)); + } + } + return results; + } + void remove(size_t id) { id = zeroNumbering ? id + 1 : id; NGT::Index::remove(id); @@ -246,6 +310,10 @@ PYBIND11_MODULE(ngtpy, m) { py::arg("epsilon") = 0.1, py::arg("edge_size") = -1, py::arg("with_distance") = true) + .def("linear_search", &::Index::linearSearch, + py::arg("query"), + py::arg("size") = 10, + py::arg("with_distance") = true) .def("get_num_of_distance_computations", &::Index::getNumOfDistanceComputations) .def("save", &NGT::Index::save) .def("close", &NGT::Index::close)