Skip to content

Commit

Permalink
Add max_token_length to chargroup tokenizer (#4911) (#4927)
Browse files Browse the repository at this point in the history
Relates: elastic/elasticsearch#56860

Co-authored-by: Russ Cam <[email protected]>
  • Loading branch information
github-actions[bot] and russcam authored Aug 4, 2020
1 parent 2404113 commit 5ab0732
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
23 changes: 21 additions & 2 deletions src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Collections.Generic;
using System.Collections.Generic;
using System.Runtime.Serialization;
using Elasticsearch.Net.Utf8Json;

namespace Nest
{
Expand All @@ -20,6 +21,16 @@ public interface ICharGroupTokenizer : ITokenizer
/// </summary>
[DataMember(Name ="tokenize_on_chars")]
IEnumerable<string> TokenizeOnCharacters { get; set; }

/// <summary>
/// The maximum token length. If a token is seen that exceeds this length then
/// it is split at <see cref="MaxTokenLength"/> intervals. Defaults to <c>255</c>.
/// <para />
/// Valid in Elasticsearch 7.9.0+
/// </summary>
/// <remarks>
/// Serialized under the name <c>max_token_length</c>. Leave as <c>null</c> to omit an explicit value.
/// </remarks>
// NOTE(review): NullableStringIntFormatter presumably tolerates the value arriving as either a
// JSON number or a JSON string - confirm against the formatter's implementation.
[DataMember(Name = "max_token_length")]
[JsonFormatter(typeof(NullableStringIntFormatter))]
int? MaxTokenLength { get; set; }
}

/// <inheritdoc cref="ICharGroupTokenizer" />
Expand All @@ -31,6 +42,9 @@ public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer

/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
public IEnumerable<string> TokenizeOnCharacters { get; set; }

/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
public int? MaxTokenLength { get; set; }
}

/// <inheritdoc cref="ICharGroupTokenizer" />
Expand All @@ -40,6 +54,7 @@ public class CharGroupTokenizerDescriptor
// The tokenizer type reported by this descriptor is the TokenizerType member of CharGroupTokenizer.
protected override string Type => CharGroupTokenizer.TokenizerType;

// Explicit ICharGroupTokenizer implementations: they hold the descriptor's state without
// clashing with the same-named fluent methods declared on this class.
IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
int? ICharGroupTokenizer.MaxTokenLength { get; set; }

/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
Expand All @@ -48,5 +63,9 @@ public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] charact
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
/// <param name="characters">The sequence assigned to <see cref="ICharGroupTokenizer.TokenizeOnCharacters" />.</param>
public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters)
{
	// Assign stores the value through the supplied setter; its result is returned for chaining.
	return Assign(characters, (tokenizer, chars) => tokenizer.TokenizeOnCharacters = chars);
}

/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
/// <param name="maxTokenLength">The value assigned to <see cref="ICharGroupTokenizer.MaxTokenLength" />.</param>
public CharGroupTokenizerDescriptor MaxTokenLength(int? maxTokenLength)
{
	// Assign stores the value through the supplied setter; its result is returned for chaining.
	return Assign(maxTokenLength, (tokenizer, length) => tokenizer.MaxTokenLength = length);
}
}
}
26 changes: 26 additions & 0 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,32 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
public override string Name => "char_group";
}

/// <summary>
/// Exercises the <c>char_group</c> tokenizer with an explicit <c>max_token_length</c>,
/// describing the same tokenizer through the fluent API, the object initializer and the
/// expected JSON.
/// </summary>
// Skipped for versions matching "<7.9.0"; the reason string notes that max_token_length arrived in 7.9.0.
[SkipVersion("<7.9.0", "max_token_length introduced in 7.9.0")]
public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxTokenLengthTests>
{
	// Single source for the token length used by all three representations below.
	private const int TokenLengthLimit = 255;

	// Characters (and character classes) the tokenizer splits on.
	private readonly string[] _splitOn = { "whitespace", "-", "\n" };

	public override string Name
	{
		get { return "char_group_max_token_length"; }
	}

	// Expected JSON form of the tokenizer.
	public override object Json
	{
		get
		{
			return new
			{
				tokenize_on_chars = _splitOn,
				type = "char_group",
				max_token_length = TokenLengthLimit
			};
		}
	}

	// Object-initializer form of the tokenizer.
	public override ITokenizer Initializer
	{
		get
		{
			return new CharGroupTokenizer
			{
				TokenizeOnCharacters = _splitOn,
				MaxTokenLength = TokenLengthLimit
			};
		}
	}

	// Fluent form of the tokenizer.
	public override FuncTokenizer Fluent
	{
		get
		{
			return (name, tokenizers) => tokenizers.CharGroup(name, charGroup => charGroup
				.TokenizeOnCharacters(_splitOn)
				.MaxTokenLength(TokenLengthLimit)
			);
		}
	}
}

[SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
{
Expand Down

0 comments on commit 5ab0732

Please sign in to comment.