This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Azure Search] Add custom analyzers (#485)
Adds the following custom analyzers: 1. The `ExactMatchCustomAnalyzer` that allows for case insensitive exact matches. This is used for the `packageId`, `version`, `owner`, and `owners` query fields. 2. The `PackageIdCustomAnalyzer` that splits on non alpha-numeric characters and camel casing. This is used by the `id` query field. This corresponds to the existing [`IdentifierAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/IdentifierAnalyzer.cs) 3. The `DescriptionCustomAnalyzer` that splits on non alpha-numeric characters and camel casing and removes stopwords (like "the" or "and"). This is used by the `title`, `description`, `author`, `authors`, and `summary` query fields. This corresponds to the existing [`DescriptionAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/DescriptionAnalyzer.cs) Addresses NuGet/NuGetGallery#6920 and NuGet/NuGetGallery#6922 Also, see functional tests: NuGet/NuGet.Services.Metadata#487
- Loading branch information
1 parent
4a91dfe
commit 7d60dc5
Showing
17 changed files
with
182 additions
and
26 deletions.
There are no files selected for viewing
28 changes: 28 additions & 0 deletions
28
src/NuGet.Services.AzureSearch/Analysis/DescriptionCustomAnalyzer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.Azure.Search.Models; | ||
|
||
namespace NuGet.Services.AzureSearch | ||
{ | ||
/// <summary>
/// Support for NuGet style description analysis. This splits tokens
/// on non alpha-numeric characters, splits tokens on camel casing,
/// lower cases tokens, and then removes stopwords from tokens.
/// </summary>
public static class DescriptionAnalyzer
{
    /// <summary>
    /// The name given to the <see cref="CustomAnalyzer"/> definition.
    /// </summary>
    public const string Name = "nuget_description_analyzer";

    /// <summary>
    /// The analyzer definition. Token filters are listed in the order
    /// they are applied to the tokenizer's output.
    /// </summary>
    public static readonly CustomAnalyzer Instance = new CustomAnalyzer(
        Name,
        PackageIdCustomTokenizer.Name,
        new List<TokenFilterName>
        {
            IdentifierCustomTokenFilter.Name,

            // Lower case BEFORE removing stopwords. The predefined stopwords
            // filter compares case-sensitively by default, so running it first
            // would let capitalized stopwords such as "The" slip through and
            // then be indexed as "the".
            TokenFilterName.Lowercase,
            TokenFilterName.Stopwords,
        });
}
} |
25 changes: 25 additions & 0 deletions
25
src/NuGet.Services.AzureSearch/Analysis/ExactMatchCustomAnalyzer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.Azure.Search.Models; | ||
|
||
namespace NuGet.Services.AzureSearch | ||
{ | ||
/// <summary>
/// Analyzer definition that enables case-insensitive exact matching
/// on a field in an Azure Search index: the whole input is kept as a
/// single token by the keyword tokenizer and is then lower cased.
/// </summary>
public static class ExactMatchCustomAnalyzer
{
    /// <summary>
    /// The name given to the <see cref="CustomAnalyzer"/> definition.
    /// </summary>
    public const string Name = "nuget_exact_match_analyzer";

    /// <summary>
    /// The shared analyzer definition.
    /// </summary>
    public static readonly CustomAnalyzer Instance = CreateAnalyzer();

    // Builds the analyzer: keyword tokenizer followed by a lowercase filter.
    private static CustomAnalyzer CreateAnalyzer()
    {
        var tokenFilters = new List<TokenFilterName>();
        tokenFilters.Add(TokenFilterName.Lowercase);

        return new CustomAnalyzer(Name, TokenizerName.Keyword, tokenFilters);
    }
}
} |
22 changes: 22 additions & 0 deletions
22
src/NuGet.Services.AzureSearch/Analysis/IdentifierCustomTokenFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using Microsoft.Azure.Search.Models; | ||
|
||
namespace NuGet.Services.AzureSearch | ||
{ | ||
/// <summary>
/// Splits tokens on camel casing and non alpha-numeric characters.
/// This does not consume the original token. For example, "Foo2Bar.Baz"
/// becomes "Foo", "2", "Bar", "Baz", and "Foo2Bar.Baz".
/// </summary>
public static class IdentifierCustomTokenFilter
{
    /// <summary>
    /// The name given to the <see cref="WordDelimiterTokenFilter"/> definition.
    /// </summary>
    public const string Name = "nuget_id_filter";

    /// <summary>
    /// The shared token filter definition. Declared <c>readonly</c>, like the
    /// other analysis definitions, so the shared instance cannot be replaced
    /// by other code at runtime.
    /// </summary>
    public static readonly WordDelimiterTokenFilter Instance = new WordDelimiterTokenFilter(
        Name,
        splitOnCaseChange: true,   // split "FooBar" into "Foo" and "Bar"
        preserveOriginal: true);   // also keep the unsplit token
}
} |
27 changes: 27 additions & 0 deletions
27
src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.Azure.Search.Models; | ||
|
||
namespace NuGet.Services.AzureSearch | ||
{ | ||
/// <summary>
/// Analyzer definition for NuGet style package identifiers. Tokens are
/// split on non alpha-numeric characters and on camel casing, and the
/// resulting tokens are lower cased.
/// </summary>
public static class PackageIdCustomAnalyzer
{
    /// <summary>
    /// The name given to the <see cref="CustomAnalyzer"/> definition.
    /// </summary>
    public const string Name = "nuget_package_id_analyzer";

    /// <summary>
    /// The shared analyzer definition.
    /// </summary>
    public static readonly CustomAnalyzer Instance = CreateAnalyzer();

    // Builds the analyzer: package ID tokenizer, then the identifier
    // splitting filter, then lower casing (filters run in list order).
    private static CustomAnalyzer CreateAnalyzer()
    {
        var tokenFilters = new List<TokenFilterName>();
        tokenFilters.Add(IdentifierCustomTokenFilter.Name);
        tokenFilters.Add(TokenFilterName.Lowercase);

        return new CustomAnalyzer(Name, PackageIdCustomTokenizer.Name, tokenFilters);
    }
}
} |
17 changes: 17 additions & 0 deletions
17
src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomTokenizer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
using Microsoft.Azure.Search.Models; | ||
|
||
namespace NuGet.Services.AzureSearch | ||
{ | ||
/// <summary>
/// Tokenizer definition that splits its input on a fixed set of symbols.
/// For example, "Foo.Bar" becomes "Foo" and "Bar".
/// </summary>
public static class PackageIdCustomTokenizer
{
    /// <summary>
    /// The name given to the <see cref="PatternTokenizer"/> definition.
    /// </summary>
    public const string Name = "nuget_package_id_tokenizer";

    // Character class listing every symbol that separates two tokens.
    private const string SeparatorPattern = @"[.\-_,;:'*#!~+()\[\]{}]";

    /// <summary>
    /// The shared tokenizer definition.
    /// </summary>
    public static readonly PatternTokenizer Instance =
        new PatternTokenizer(Name, SeparatorPattern);
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters