Skip to content

Commit

Permalink
Implement Rare Terms aggregation (#4054)
Browse files Browse the repository at this point in the history
Relates: #4001

This commit implements the rare terms aggregation. Rare terms buckets expose only key and doc_count, so a new RareTermsBucket<TKey> type is used.

(cherry picked from commit 5c43f5b)
  • Loading branch information
russcam committed Aug 30, 2019
1 parent f93d5d1 commit 2fbd3d8
Show file tree
Hide file tree
Showing 7 changed files with 266 additions and 0 deletions.
26 changes: 26 additions & 0 deletions src/Nest/Aggregations/AggregateDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,20 @@ public TermsAggregate<TKey> Terms<TKey>(string key)

/// <summary>
/// Returns the result of <c>GetMultiKeyedBucketAggregate</c> for <paramref name="key"/>,
/// with <see cref="string"/> bucket keys.
/// </summary>
public MultiBucketAggregate<KeyedBucket<string>> AdjacencyMatrix(string key)
{
    return GetMultiKeyedBucketAggregate<string>(key);
}

/// <summary>
/// Gets the rare terms aggregate stored under <paramref name="key"/>, with each bucket key
/// converted to <typeparamref name="TKey"/>.
/// </summary>
/// <returns>
/// A multi-bucket aggregate carrying the converted buckets and the meta of the stored aggregate,
/// or <c>null</c> when <c>TryGet</c> returns no bucket aggregate for <paramref name="key"/>.
/// </returns>
public MultiBucketAggregate<RareTermsBucket<TKey>> RareTerms<TKey>(string key)
{
    var aggregate = TryGet<BucketAggregate>(key);
    if (aggregate == null)
        return null;

    var result = new MultiBucketAggregate<RareTermsBucket<TKey>>();
    // Materialize the lazily produced buckets once, before exposing them
    result.Buckets = GetRareTermsBuckets<TKey>(aggregate.Items).ToList();
    result.Meta = aggregate.Meta;
    return result;
}

/// <summary>
/// Gets the rare terms aggregate stored under <paramref name="key"/>, with <see cref="string"/> bucket keys.
/// </summary>
public MultiBucketAggregate<RareTermsBucket<string>> RareTerms(string key)
{
    return RareTerms<string>(key);
}

/// <summary>
/// Returns the result of <c>GetMultiBucketAggregate</c> for <paramref name="key"/>,
/// with <see cref="RangeBucket"/> buckets.
/// </summary>
public MultiBucketAggregate<RangeBucket> Range(string key)
{
    return GetMultiBucketAggregate<RangeBucket>(key);
}

/// <summary>
/// Returns the result of <c>GetMultiBucketAggregate</c> for <paramref name="key"/>,
/// with <see cref="RangeBucket"/> buckets.
/// </summary>
public MultiBucketAggregate<RangeBucket> DateRange(string key)
{
    return GetMultiBucketAggregate<RangeBucket>(key);
}
Expand Down Expand Up @@ -275,5 +289,17 @@ private IEnumerable<SignificantTermsBucket<TKey>> GetSignificantTermsBuckets<TKe
Score = bucket.Score
};
}

/// <summary>
/// Lazily projects untyped keyed buckets into <see cref="RareTermsBucket{TKey}"/> instances,
/// converting each bucket key to <typeparamref name="TKey"/>.
/// </summary>
/// <param name="items">The buckets to project; each one is cast to <c>KeyedBucket&lt;object&gt;</c>.</param>
/// <returns>
/// A deferred sequence of rare terms buckets. Each bucket is constructed from the source bucket's
/// backing dictionary; a missing document count is reported as <c>0</c>.
/// </returns>
private IEnumerable<RareTermsBucket<TKey>> GetRareTermsBuckets<TKey>(IEnumerable<IBucket> items)
{
    var buckets = items.Cast<KeyedBucket<object>>();

    foreach (var bucket in buckets)
    {
        yield return new RareTermsBucket<TKey>(bucket.BackingDictionary)
        {
            // Convert with the invariant culture: the two-argument Convert.ChangeType overload uses the
            // current thread culture, which makes number <-> string key conversions depend on the
            // machine's locale (e.g. decimal separators).
            Key = (TKey)Convert.ChangeType(bucket.Key, typeof(TKey), System.Globalization.CultureInfo.InvariantCulture),
            DocCount = bucket.DocCount.GetValueOrDefault(0)
        };
    }
}
}
}
12 changes: 12 additions & 0 deletions src/Nest/Aggregations/AggregationContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ public interface IAggregationContainer
/// <summary>
/// The range aggregation held by this container. Data member name: <c>range</c>.
/// </summary>
[DataMember(Name = "range")]
IRangeAggregation Range { get; set; }

/// <summary>
/// The rare terms aggregation held by this container. Data member name: <c>rare_terms</c>.
/// </summary>
[DataMember(Name = "rare_terms")]
IRareTermsAggregation RareTerms { get; set; }

/// <summary>
/// The reverse nested aggregation held by this container. Data member name: <c>reverse_nested</c>.
/// </summary>
[DataMember(Name = "reverse_nested")]
IReverseNestedAggregation ReverseNested { get; set; }

Expand Down Expand Up @@ -339,6 +342,8 @@ public class AggregationContainer : IAggregationContainer

/// <summary>The range aggregation held by this container.</summary>
public IRangeAggregation Range { get; set; }

/// <summary>The rare terms aggregation held by this container.</summary>
public IRareTermsAggregation RareTerms { get; set; }

/// <summary>The reverse nested aggregation held by this container.</summary>
public IReverseNestedAggregation ReverseNested { get; set; }

/// <summary>The sampler aggregation held by this container.</summary>
public ISamplerAggregation Sampler { get; set; }
Expand Down Expand Up @@ -481,6 +486,8 @@ public class AggregationContainerDescriptor<T> : DescriptorBase<AggregationConta

// Explicit implementation of IAggregationContainer.Range
IRangeAggregation IAggregationContainer.Range { get; set; }

// Explicit implementation of IAggregationContainer.RareTerms
IRareTermsAggregation IAggregationContainer.RareTerms { get; set; }

// Explicit implementation of IAggregationContainer.ReverseNested
IReverseNestedAggregation IAggregationContainer.ReverseNested { get; set; }

// Explicit implementation of IAggregationContainer.Sampler
ISamplerAggregation IAggregationContainer.Sampler { get; set; }
Expand Down Expand Up @@ -638,6 +645,11 @@ Func<RangeAggregationDescriptor<T>, IRangeAggregation> selector
) =>
_SetInnerAggregation(name, selector, (a, d) => a.Range = d);

/// <summary>
/// Registers a rare terms aggregation under <paramref name="name"/>.
/// </summary>
/// <param name="name">The name passed to <c>_SetInnerAggregation</c> for the aggregation.</param>
/// <param name="selector">Builds the rare terms aggregation from a fresh descriptor.</param>
public AggregationContainerDescriptor<T> RareTerms(string name,
    Func<RareTermsAggregationDescriptor<T>, IRareTermsAggregation> selector
)
{
    // The assigner stores the built aggregation in the container's RareTerms slot
    return _SetInnerAggregation(name, selector, (container, rareTerms) => container.RareTerms = rareTerms);
}

public AggregationContainerDescriptor<T> Stats(string name,
Func<StatsAggregationDescriptor<T>, IStatsAggregation> selector
) =>
Expand Down
127 changes: 127 additions & 0 deletions src/Nest/Aggregations/Bucket/RareTerms/RareTermsAggregation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
using System;
using System.Collections.Generic;
using System.Linq.Expressions;
using System.Runtime.Serialization;
using Elasticsearch.Net.Utf8Json;

namespace Nest
{
/// <summary>
/// A multi-bucket value source based aggregation which finds "rare" terms — terms that are at the long-tail of the distribution
/// and are not frequent. Conceptually, this is like a terms aggregation that is sorted by _count ascending.
/// </summary>
/// <remarks>
/// Declared with <c>ReadAs(typeof(RareTermsAggregation))</c>, i.e. <see cref="RareTermsAggregation"/> is the
/// concrete type named for reading this interface.
/// </remarks>
[InterfaceDataContract]
[ReadAs(typeof(RareTermsAggregation))]
public interface IRareTermsAggregation : IBucketAggregation
{
/// <summary>
/// Terms that should be excluded from the aggregation.
/// Data member name: <c>exclude</c>
/// </summary>
[DataMember(Name = "exclude")]
TermsExclude Exclude { get; set; }

/// <summary>
/// The field to find rare terms in.
/// Data member name: <c>field</c>
/// </summary>
[DataMember(Name = "field")]
Field Field { get; set; }

/// <summary>
/// Terms that should be included in the aggregation.
/// Data member name: <c>include</c>
/// </summary>
[DataMember(Name = "include")]
TermsInclude Include { get; set; }

/// <summary>
/// The maximum number of documents a term should appear in.
/// Defaults to <c>1</c>.
/// Data member name: <c>max_doc_count</c>
/// </summary>
[DataMember(Name = "max_doc_count")]
long? MaximumDocumentCount { get; set; }

/// <summary>
/// The value that should be used if a document does not have the field being aggregated.
/// Data member name: <c>missing</c>
/// </summary>
[DataMember(Name = "missing")]
object Missing { get; set; }

/// <summary>
/// The precision of the internal CuckooFilters. Smaller precision leads to better approximation,
/// but higher memory usage. Cannot be smaller than <c>0.00001</c>. Defaults to <c>0.01</c>.
/// Data member name: <c>precision</c>
/// </summary>
[DataMember(Name = "precision")]
double? Precision { get; set; }
}

/// <inheritdoc cref="IRareTermsAggregation"/>
public class RareTermsAggregation : BucketAggregationBase, IRareTermsAggregation
{
    // Parameterless constructor, visible only inside the assembly
    internal RareTermsAggregation()
    {
    }

    /// <summary>
    /// Creates a rare terms aggregation, passing <paramref name="name"/> to the base constructor.
    /// </summary>
    public RareTermsAggregation(string name)
        : base(name)
    {
    }

    /// <inheritdoc />
    public TermsExclude Exclude { get; set; }

    /// <inheritdoc />
    public Field Field { get; set; }

    /// <inheritdoc />
    public TermsInclude Include { get; set; }

    /// <inheritdoc />
    public long? MaximumDocumentCount { get; set; }

    /// <inheritdoc />
    public object Missing { get; set; }

    /// <inheritdoc />
    public double? Precision { get; set; }

    // Stores this aggregation in the container's RareTerms slot
    internal override void WrapInContainer(AggregationContainer c)
    {
        c.RareTerms = this;
    }
}

/// <inheritdoc cref="IRareTermsAggregation"/>
public class RareTermsAggregationDescriptor<T>
    : BucketAggregationDescriptorBase<RareTermsAggregationDescriptor<T>, IRareTermsAggregation, T>, IRareTermsAggregation
    where T : class
{
    // State is held in explicit interface implementations; the fluent methods below write to it via Assign
    TermsExclude IRareTermsAggregation.Exclude { get; set; }
    Field IRareTermsAggregation.Field { get; set; }
    TermsInclude IRareTermsAggregation.Include { get; set; }
    long? IRareTermsAggregation.MaximumDocumentCount { get; set; }
    object IRareTermsAggregation.Missing { get; set; }
    double? IRareTermsAggregation.Precision { get; set; }

    /// <inheritdoc cref="IRareTermsAggregation.Field" />
    public RareTermsAggregationDescriptor<T> Field(Field field)
    {
        return Assign(field, (agg, value) => agg.Field = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Field" />
    public RareTermsAggregationDescriptor<T> Field<TValue>(Expression<Func<T, TValue>> field)
    {
        return Assign(field, (agg, value) => agg.Field = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.MaximumDocumentCount" />
    public RareTermsAggregationDescriptor<T> MaximumDocumentCount(long? maximumDocumentCount)
    {
        return Assign(maximumDocumentCount, (agg, value) => agg.MaximumDocumentCount = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(long partition, long numberOfPartitions)
    {
        var include = new TermsInclude(partition, numberOfPartitions);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(string includePattern)
    {
        var include = new TermsInclude(includePattern);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(IEnumerable<string> values)
    {
        var include = new TermsInclude(values);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Exclude" />
    public RareTermsAggregationDescriptor<T> Exclude(string excludePattern)
    {
        var exclude = new TermsExclude(excludePattern);
        return Assign(exclude, (agg, value) => agg.Exclude = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Exclude" />
    public RareTermsAggregationDescriptor<T> Exclude(IEnumerable<string> values)
    {
        var exclude = new TermsExclude(values);
        return Assign(exclude, (agg, value) => agg.Exclude = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Missing" />
    public RareTermsAggregationDescriptor<T> Missing(object missing)
    {
        return Assign(missing, (agg, value) => agg.Missing = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Precision" />
    public RareTermsAggregationDescriptor<T> Precision(double? precision)
    {
        return Assign(precision, (agg, value) => agg.Precision = value);
    }
}
}
13 changes: 13 additions & 0 deletions src/Nest/Aggregations/Bucket/RareTerms/RareTermsBucket.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using System.Collections.Generic;

namespace Nest
{
/// <summary>
/// A bucket of a rare terms aggregation, exposing a key and a document count.
/// </summary>
/// <typeparam name="TKey">The type of the bucket key</typeparam>
public class RareTermsBucket<TKey> : BucketBase
{
/// <summary>
/// Creates a bucket, passing <paramref name="dict"/> to the <c>BucketBase</c> constructor.
/// </summary>
public RareTermsBucket(IReadOnlyDictionary<string, IAggregate> dict) : base(dict) { }

/// <summary>
/// The document count of this bucket
/// </summary>
public long DocCount { get; set; }

/// <summary>
/// The key of this bucket
/// </summary>
public TKey Key { get; set; }
}
}
4 changes: 4 additions & 0 deletions src/Nest/Aggregations/Visitor/AggregationVisitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ public interface IAggregationVisitor

/// <summary>Visits a range aggregation.</summary>
void Visit(IRangeAggregation aggregation);

/// <summary>Visits a rare terms aggregation.</summary>
void Visit(IRareTermsAggregation aggregation);

/// <summary>Visits a terms aggregation.</summary>
void Visit(ITermsAggregation aggregation);

/// <summary>Visits a significant terms aggregation.</summary>
void Visit(ISignificantTermsAggregation aggregation);
Expand Down Expand Up @@ -201,6 +203,8 @@ public virtual void Visit(ISignificantTermsAggregation aggregation) { }

/// <summary>Visits a range aggregation. Does nothing unless overridden.</summary>
public virtual void Visit(IRangeAggregation aggregation) { }

/// <summary>Visits a rare terms aggregation. Does nothing unless overridden.</summary>
public virtual void Visit(IRareTermsAggregation aggregation) { }

/// <summary>Visits a nested aggregation. Does nothing unless overridden.</summary>
public virtual void Visit(INestedAggregation aggregation) { }

/// <summary>Visits a parent aggregation. Does nothing unless overridden.</summary>
public virtual void Visit(IParentAggregation aggregation) { }
Expand Down
5 changes: 5 additions & 0 deletions src/Nest/Aggregations/Visitor/AggregationWalker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ public void Walk(IAggregationContainer aggregation, IAggregationVisitor visitor)
v.Visit(d);
Accept(v, d.Aggregations);
});
// Rare terms: visit the aggregation itself, then pass its nested Aggregations to Accept
AcceptAggregation(aggregation.RareTerms, visitor, (v, d) =>
{
v.Visit(d);
Accept(v, d.Aggregations);
});
AcceptAggregation(aggregation.ReverseNested, visitor, (v, d) =>
{
v.Visit(d);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
using System;
using System.Collections.Generic;
using FluentAssertions;
using Nest;
using Tests.Core.Extensions;
using Tests.Core.ManagedElasticsearch.Clusters;
using Tests.Domain;
using Tests.Framework.EndpointTests.TestState;

namespace Tests.Aggregations.Bucket.RareTerms
{
/**
* A multi-bucket value source based aggregation which finds "rare" terms — terms that are at the long-tail of the
* distribution and are not frequent. Conceptually, this is like a terms aggregation that is sorted by _count ascending.
* As noted in the terms aggregation docs, actually ordering a terms agg by count ascending has unbounded error.
* Instead, you should use the rare_terms aggregation.
*
* See the Elasticsearch documentation on {ref_current}/search-aggregations-bucket-rare-terms-aggregation.html[rare terms aggregation] for more detail.
*/
public class RareTermsAggregationUsageTests : AggregationUsageTestBase
{
    public RareTermsAggregationUsageTests(ReadOnlyCluster i, EndpointUsage usage)
        : base(i, usage)
    {
    }

    // The aggregation JSON that both the fluent and the initializer forms below describe
    protected override object AggregationJson => new
    {
        names = new
        {
            meta = new
            {
                foo = "bar"
            },
            rare_terms = new
            {
                field = "name",
                max_doc_count = 5,
                missing = "n/a",
                precision = 0.001
            }
        }
    };

    // Fluent (descriptor) form of the aggregation
    protected override Func<AggregationContainerDescriptor<Project>, IAggregationContainer> FluentAggs => aggs => aggs
        .RareTerms("names", rare => rare
            .Field(p => p.Name)
            .Missing("n/a")
            .MaximumDocumentCount(5)
            .Precision(0.001)
            .Meta(meta => meta
                .Add("foo", "bar")
            )
        );

    // Object initializer form of the aggregation
    protected override AggregationDictionary InitializerAggs =>
        new RareTermsAggregation("names")
        {
            Field = Infer.Field<Project>(p => p.Name),
            MaximumDocumentCount = 5,
            Precision = 0.001,
            Missing = "n/a",
            Meta = new Dictionary<string, object> { { "foo", "bar" } }
        };

    protected override void ExpectResponse(ISearchResponse<Project> response)
    {
        response.ShouldBeValid();

        var aggregate = response.Aggregations.RareTerms("names");
        aggregate.Should().NotBeNull();

        // At least one bucket, each with a non-empty key and a document count of at least 1
        aggregate.Buckets.Should().NotBeNull();
        aggregate.Buckets.Count.Should().BeGreaterThan(0);
        foreach (var bucket in aggregate.Buckets)
        {
            bucket.Key.Should().NotBeNullOrEmpty();
            bucket.DocCount.Should().BeGreaterOrEqualTo(1);
        }

        // The meta sent with the request is echoed back on the aggregate
        aggregate.Meta.Should().NotBeNull().And.HaveCount(1);
        aggregate.Meta["foo"].Should().Be("bar");
    }
}
}

0 comments on commit 2fbd3d8

Please sign in to comment.