From d3e6b7a3aa8ec9cb8b32bf860f2d4f0d6766ed92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Sharma?=
Date: Fri, 1 Mar 2019 11:02:05 -0800
Subject: [PATCH] [Azure Search] Add custom analyzers (#485)

Adds the following custom analyzers:

1. The `ExactMatchCustomAnalyzer` that allows for case-insensitive exact matches. This is used for the `packageId`, `version`, `owner`, and `owners` query fields.
2. The `PackageIdCustomAnalyzer` that splits on non alpha-numeric characters and camel casing. This is used by the `id` query field. This corresponds to the existing [`IdentifierAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/IdentifierAnalyzer.cs).
3. The `DescriptionCustomAnalyzer` that splits on non alpha-numeric characters and camel casing and removes stopwords (like "the" or "and"). This is used by the `title`, `description`, `author`, `authors`, and `summary` query fields. This corresponds to the existing [`DescriptionAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/DescriptionAnalyzer.cs).

Addresses https://github.com/nuget/nugetgallery/issues/6920 and https://github.com/nuget/nugetgallery/issues/6922

Also, see functional tests: https://github.com/NuGet/NuGet.Services.Metadata/pull/487
---
 .../Analysis/DescriptionCustomAnalyzer.cs | 28 ++++++++++++++++
 .../Analysis/ExactMatchCustomAnalyzer.cs | 25 +++++++++++++++
 .../Analysis/IdentifierCustomTokenFilter.cs | 22 +++++++++++++
 .../Analysis/PackageIdCustomAnalyzer.cs | 27 ++++++++++++++++
 .../Analysis/PackageIdCustomTokenizer.cs | 17 ++++++++++
 .../NewPackageRegistrationProducer.cs | 2 +-
 .../DocumentUtilities.cs | 2 ++
 .../IndexBuilder.cs | 32 +++++++++++++++----
 .../Models/BaseMetadataDocument.cs | 16 ++++++++++
 .../Models/IBaseMetadataDocument.cs | 1 +
 .../Models/SearchDocument.cs | 1 +
 .../NuGet.Services.AzureSearch.csproj | 5 +++
 .../SearchService/IndexFields.cs | 1 +
 .../SearchService/SearchTextBuilder.cs | 2 +-
.../HijackDocumentBuilderFacts.cs | 2 ++ .../SearchDocumentBuilderFacts.cs | 2 ++ .../SearchService/SearchTextBuilderFacts.cs | 23 ++++--------- 17 files changed, 182 insertions(+), 26 deletions(-) create mode 100644 src/NuGet.Services.AzureSearch/Analysis/DescriptionCustomAnalyzer.cs create mode 100644 src/NuGet.Services.AzureSearch/Analysis/ExactMatchCustomAnalyzer.cs create mode 100644 src/NuGet.Services.AzureSearch/Analysis/IdentifierCustomTokenFilter.cs create mode 100644 src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs create mode 100644 src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomTokenizer.cs diff --git a/src/NuGet.Services.AzureSearch/Analysis/DescriptionCustomAnalyzer.cs b/src/NuGet.Services.AzureSearch/Analysis/DescriptionCustomAnalyzer.cs new file mode 100644 index 000000000..b9abcb694 --- /dev/null +++ b/src/NuGet.Services.AzureSearch/Analysis/DescriptionCustomAnalyzer.cs @@ -0,0 +1,28 @@ +// Copyright (c) .NET Foundation. All rights reserved. +// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. + +using System.Collections.Generic; +using Microsoft.Azure.Search.Models; + +namespace NuGet.Services.AzureSearch +{ + /// + /// Support for NuGet style description analysis. This splits tokens + /// on non alpha-numeric characters, splits tokens on camel casing, + /// lower cases tokens, and then removes stopwords from tokens. 
+    ///
+    public static class DescriptionAnalyzer
+    {
+        public const string Name = "nuget_description_analyzer";
+
+        public static readonly CustomAnalyzer Instance = new CustomAnalyzer(
+            Name,
+            PackageIdCustomTokenizer.Name,
+            new List<TokenFilterName>
+            {
+                IdentifierCustomTokenFilter.Name, // Runs first: splitting on case changes needs the original casing.
+                TokenFilterName.Lowercase, // Lower-case before stopword removal, as the summary describes; the predefined stopwords filter is case-sensitive by default.
+                TokenFilterName.Stopwords,
+            });
+    }
+}
diff --git a/src/NuGet.Services.AzureSearch/Analysis/ExactMatchCustomAnalyzer.cs b/src/NuGet.Services.AzureSearch/Analysis/ExactMatchCustomAnalyzer.cs
new file mode 100644
index 000000000..bcd7e4c20
--- /dev/null
+++ b/src/NuGet.Services.AzureSearch/Analysis/ExactMatchCustomAnalyzer.cs
@@ -0,0 +1,25 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System.Collections.Generic;
+using Microsoft.Azure.Search.Models;
+
+namespace NuGet.Services.AzureSearch
+{
+    ///
+    /// Support for case-insensitive exact matching on a field
+    /// in an Azure Search index.
+    ///
+    public static class ExactMatchCustomAnalyzer
+    {
+        public const string Name = "nuget_exact_match_analyzer";
+
+        public static readonly CustomAnalyzer Instance = new CustomAnalyzer(
+            Name,
+            TokenizerName.Keyword,
+            new List<TokenFilterName>
+            {
+                TokenFilterName.Lowercase
+            });
+    }
+}
diff --git a/src/NuGet.Services.AzureSearch/Analysis/IdentifierCustomTokenFilter.cs b/src/NuGet.Services.AzureSearch/Analysis/IdentifierCustomTokenFilter.cs
new file mode 100644
index 000000000..b21d784ab
--- /dev/null
+++ b/src/NuGet.Services.AzureSearch/Analysis/IdentifierCustomTokenFilter.cs
@@ -0,0 +1,22 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using Microsoft.Azure.Search.Models;
+
+namespace NuGet.Services.AzureSearch
+{
+    ///
+    /// Splits tokens on camel casing and non alpha-numeric characters.
+    /// This does not consume the original token. For example, "Foo2Bar.Baz"
+    /// becomes "Foo", "2", "Bar", "Baz", and "Foo2Bar.Baz".
+    ///
+    public static class IdentifierCustomTokenFilter
+    {
+        public const string Name = "nuget_id_filter";
+
+        public static readonly WordDelimiterTokenFilter Instance = new WordDelimiterTokenFilter( // readonly: shared singleton, like the other Instance fields.
+            Name,
+            splitOnCaseChange: true,
+            preserveOriginal: true);
+    }
+}
diff --git a/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs b/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs
new file mode 100644
index 000000000..fc10f10ec
--- /dev/null
+++ b/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs
@@ -0,0 +1,27 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System.Collections.Generic;
+using Microsoft.Azure.Search.Models;
+
+namespace NuGet.Services.AzureSearch
+{
+    ///
+    /// Support for NuGet style identifier analysis. Splits tokens
+    /// on non alpha-numeric characters and camel casing, and lower
+    /// cases tokens.
+    ///
+    public static class PackageIdCustomAnalyzer
+    {
+        public const string Name = "nuget_package_id_analyzer";
+
+        public static readonly CustomAnalyzer Instance = new CustomAnalyzer(
+            Name,
+            PackageIdCustomTokenizer.Name,
+            new List<TokenFilterName>
+            {
+                IdentifierCustomTokenFilter.Name,
+                TokenFilterName.Lowercase,
+            });
+    }
+}
diff --git a/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomTokenizer.cs b/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomTokenizer.cs
new file mode 100644
index 000000000..ef1f8b739
--- /dev/null
+++ b/src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomTokenizer.cs
@@ -0,0 +1,17 @@
+using Microsoft.Azure.Search.Models;
+
+namespace NuGet.Services.AzureSearch
+{
+    ///
+    /// Splits tokens on a set of symbols.
+    /// For example, "Foo.Bar" becomes "Foo" and "Bar".
+ /// + public static class PackageIdCustomTokenizer + { + public const string Name = "nuget_package_id_tokenizer"; + + public static readonly PatternTokenizer Instance = new PatternTokenizer( + Name, + @"[.\-_,;:'*#!~+()\[\]{}]"); + } +} diff --git a/src/NuGet.Services.AzureSearch/Db2AzureSearch/NewPackageRegistrationProducer.cs b/src/NuGet.Services.AzureSearch/Db2AzureSearch/NewPackageRegistrationProducer.cs index 62af7fdcb..3f9a29372 100644 --- a/src/NuGet.Services.AzureSearch/Db2AzureSearch/NewPackageRegistrationProducer.cs +++ b/src/NuGet.Services.AzureSearch/Db2AzureSearch/NewPackageRegistrationProducer.cs @@ -87,7 +87,7 @@ private bool ShouldWait(ConcurrentBag allWork, bool log) if (log) { _logger.LogInformation( - "There are {PackageCount} packages to in memory, waiting to be pushed to Azure Search. " + + "There are {PackageCount} packages in memory waiting to be pushed to Azure Search. " + "Waiting until this number drops below {Max} before fetching more packages.", packageCount, max); diff --git a/src/NuGet.Services.AzureSearch/DocumentUtilities.cs b/src/NuGet.Services.AzureSearch/DocumentUtilities.cs index 47e952c05..e738ec8ec 100644 --- a/src/NuGet.Services.AzureSearch/DocumentUtilities.cs +++ b/src/NuGet.Services.AzureSearch/DocumentUtilities.cs @@ -103,6 +103,7 @@ public static void PopulateMetadata( document.Summary = package.Summary; document.Tags = package.Tags == null ? null : Utils.SplitTags(package.Tags); document.Title = package.Title; + document.TokenizedPackageId = packageId; } public static void PopulateMetadata( @@ -136,6 +137,7 @@ public static void PopulateMetadata( document.Summary = leaf.Summary; document.Tags = leaf.Tags == null ? 
null : leaf.Tags.ToArray(); document.Title = leaf.Title; + document.TokenizedPackageId = leaf.PackageId; } private static string GetSortableTitle(string title, string packageId) diff --git a/src/NuGet.Services.AzureSearch/IndexBuilder.cs b/src/NuGet.Services.AzureSearch/IndexBuilder.cs index 0a3582b5f..a8faef292 100644 --- a/src/NuGet.Services.AzureSearch/IndexBuilder.cs +++ b/src/NuGet.Services.AzureSearch/IndexBuilder.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. using System; +using System.Collections.Generic; using System.Threading.Tasks; using Microsoft.Azure.Search; using Microsoft.Azure.Search.Models; @@ -92,19 +93,36 @@ private async Task CreateIndexIfNotExistsAsync(Index index) private Index InitializeSearchIndex() { - return new Index - { - Name = _options.Value.SearchIndexName, - Fields = FieldBuilder.BuildForType(), - }; + return InitializeIndex( + _options.Value.SearchIndexName); } private Index InitializeHijackIndex() + { + return InitializeIndex( + _options.Value.HijackIndexName); + } + + private Index InitializeIndex(string name) { return new Index { - Name = _options.Value.HijackIndexName, - Fields = FieldBuilder.BuildForType(), + Name = name, + Fields = FieldBuilder.BuildForType(), + Analyzers = new List + { + DescriptionAnalyzer.Instance, + ExactMatchCustomAnalyzer.Instance, + PackageIdCustomAnalyzer.Instance, + }, + Tokenizers = new List + { + PackageIdCustomTokenizer.Instance, + }, + TokenFilters = new List + { + IdentifierCustomTokenFilter.Instance, + } }; } } diff --git a/src/NuGet.Services.AzureSearch/Models/BaseMetadataDocument.cs b/src/NuGet.Services.AzureSearch/Models/BaseMetadataDocument.cs index f6c339786..f64cb0f94 100644 --- a/src/NuGet.Services.AzureSearch/Models/BaseMetadataDocument.cs +++ b/src/NuGet.Services.AzureSearch/Models/BaseMetadataDocument.cs @@ -14,12 +14,14 @@ public abstract class BaseMetadataDocument : CommittedDocument, 
IBaseMetadataDoc public int? SemVerLevel { get; set; } [IsSearchable] + [Analyzer(DescriptionAnalyzer.Name)] public string Authors { get; set; } public string Copyright { get; set; } public DateTimeOffset? Created { get; set; } [IsSearchable] + [Analyzer(DescriptionAnalyzer.Name)] public string Description { get; set; } public long? FileSize { get; set; } @@ -36,11 +38,16 @@ public abstract class BaseMetadataDocument : CommittedDocument, IBaseMetadataDoc public string MinClientVersion { get; set; } [IsSearchable] + [Analyzer(ExactMatchCustomAnalyzer.Name)] public string NormalizedVersion { get; set; } public string OriginalVersion { get; set; } + /// + /// The package's identifier. Supports case insensitive exact matching. + /// [IsSearchable] + [Analyzer(ExactMatchCustomAnalyzer.Name)] public string PackageId { get; set; } [IsFilterable] @@ -58,12 +65,21 @@ public abstract class BaseMetadataDocument : CommittedDocument, IBaseMetadataDoc public string SortableTitle { get; set; } [IsSearchable] + [Analyzer(DescriptionAnalyzer.Name)] public string Summary { get; set; } [IsSearchable] public string[] Tags { get; set; } [IsSearchable] + [Analyzer(DescriptionAnalyzer.Name)] public string Title { get; set; } + + /// + /// The package's identifier. Supports tokenized search. + /// + [IsSearchable] + [Analyzer(PackageIdCustomAnalyzer.Name)] + public string TokenizedPackageId { get; set; } } } diff --git a/src/NuGet.Services.AzureSearch/Models/IBaseMetadataDocument.cs b/src/NuGet.Services.AzureSearch/Models/IBaseMetadataDocument.cs index 5944d1cc1..7ae2587e4 100644 --- a/src/NuGet.Services.AzureSearch/Models/IBaseMetadataDocument.cs +++ b/src/NuGet.Services.AzureSearch/Models/IBaseMetadataDocument.cs @@ -26,6 +26,7 @@ public interface IBaseMetadataDocument : ICommittedDocument string NormalizedVersion { get; set; } string OriginalVersion { get; set; } string PackageId { get; set; } + string TokenizedPackageId { get; set; } bool? 
Prerelease { get; set; } string ProjectUrl { get; set; } DateTimeOffset? Published { get; set; } diff --git a/src/NuGet.Services.AzureSearch/Models/SearchDocument.cs b/src/NuGet.Services.AzureSearch/Models/SearchDocument.cs index 9e038a19b..d6cc696c2 100644 --- a/src/NuGet.Services.AzureSearch/Models/SearchDocument.cs +++ b/src/NuGet.Services.AzureSearch/Models/SearchDocument.cs @@ -29,6 +29,7 @@ public class Full : AddFirst public class AddFirst : UpdateLatest { [IsSearchable] + [Analyzer(ExactMatchCustomAnalyzer.Name)] public string[] Owners { get; set; } } diff --git a/src/NuGet.Services.AzureSearch/NuGet.Services.AzureSearch.csproj b/src/NuGet.Services.AzureSearch/NuGet.Services.AzureSearch.csproj index 35b1cdb35..306f96d21 100644 --- a/src/NuGet.Services.AzureSearch/NuGet.Services.AzureSearch.csproj +++ b/src/NuGet.Services.AzureSearch/NuGet.Services.AzureSearch.csproj @@ -43,11 +43,16 @@ + + + + + diff --git a/src/NuGet.Services.AzureSearch/SearchService/IndexFields.cs b/src/NuGet.Services.AzureSearch/SearchService/IndexFields.cs index 6b2305203..198063cee 100644 --- a/src/NuGet.Services.AzureSearch/SearchService/IndexFields.cs +++ b/src/NuGet.Services.AzureSearch/SearchService/IndexFields.cs @@ -25,6 +25,7 @@ private static string Name(string input) public static readonly string Summary = Name(nameof(BaseMetadataDocument.Summary)); public static readonly string Tags = Name(nameof(BaseMetadataDocument.Tags)); public static readonly string Title = Name(nameof(BaseMetadataDocument.Title)); + public static readonly string TokenizedPackageId = Name(nameof(BaseMetadataDocument.TokenizedPackageId)); public static class Search { diff --git a/src/NuGet.Services.AzureSearch/SearchService/SearchTextBuilder.cs b/src/NuGet.Services.AzureSearch/SearchService/SearchTextBuilder.cs index f69c9c6a8..be17e700d 100644 --- a/src/NuGet.Services.AzureSearch/SearchService/SearchTextBuilder.cs +++ b/src/NuGet.Services.AzureSearch/SearchService/SearchTextBuilder.cs @@ -18,7 +18,7 @@ 
public class SearchTextBuilder : ISearchTextBuilder { { QueryField.Author, IndexFields.Authors }, { QueryField.Description, IndexFields.Description }, - { QueryField.Id, IndexFields.PackageId }, + { QueryField.Id, IndexFields.TokenizedPackageId }, { QueryField.Owner, IndexFields.Search.Owners }, { QueryField.PackageId, IndexFields.PackageId }, { QueryField.Summary, IndexFields.Summary }, diff --git a/tests/NuGet.Services.AzureSearch.Tests/HijackDocumentBuilderFacts.cs b/tests/NuGet.Services.AzureSearch.Tests/HijackDocumentBuilderFacts.cs index 75067c3ad..8c0e2b23c 100644 --- a/tests/NuGet.Services.AzureSearch.Tests/HijackDocumentBuilderFacts.cs +++ b/tests/NuGet.Services.AzureSearch.Tests/HijackDocumentBuilderFacts.cs @@ -155,6 +155,7 @@ public async Task SetsExpectedProperties() ""windowsazureofficial"" ], ""title"": ""Windows Azure Storage"", + ""tokenizedPackageId"": ""WindowsAzure.Storage"", ""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"", ""lastDocumentType"": ""NuGet.Services.AzureSearch.HijackDocument+Full"", ""lastUpdatedFromCatalog"": false, @@ -288,6 +289,7 @@ public async Task SetsExpectedProperties() ""windowsazureofficial"" ], ""title"": ""Windows Azure Storage"", + ""tokenizedPackageId"": ""WindowsAzure.Storage"", ""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"", ""lastDocumentType"": ""NuGet.Services.AzureSearch.HijackDocument+Full"", ""lastUpdatedFromCatalog"": true, diff --git a/tests/NuGet.Services.AzureSearch.Tests/SearchDocumentBuilderFacts.cs b/tests/NuGet.Services.AzureSearch.Tests/SearchDocumentBuilderFacts.cs index a062e10a8..418c30a73 100644 --- a/tests/NuGet.Services.AzureSearch.Tests/SearchDocumentBuilderFacts.cs +++ b/tests/NuGet.Services.AzureSearch.Tests/SearchDocumentBuilderFacts.cs @@ -276,6 +276,7 @@ public async Task SetsExpectedProperties(SearchFilters searchFilters, string exp ""windowsazureofficial"" ], ""title"": ""Windows Azure Storage"", + ""tokenizedPackageId"": ""WindowsAzure.Storage"", ""lastUpdatedDocument"": 
""2018-12-14T09:30:00+00:00"", ""lastDocumentType"": ""NuGet.Services.AzureSearch.SearchDocument+UpdateLatest"", ""lastUpdatedFromCatalog"": true, @@ -446,6 +447,7 @@ public async Task SetsExpectedProperties(SearchFilters searchFilters, string exp ""windowsazureofficial"" ], ""title"": ""Windows Azure Storage"", + ""tokenizedPackageId"": ""WindowsAzure.Storage"", ""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"", ""lastDocumentType"": ""NuGet.Services.AzureSearch.SearchDocument+Full"", ""lastUpdatedFromCatalog"": false, diff --git a/tests/NuGet.Services.AzureSearch.Tests/SearchService/SearchTextBuilderFacts.cs b/tests/NuGet.Services.AzureSearch.Tests/SearchService/SearchTextBuilderFacts.cs index 9b396ac76..6d7dbe535 100644 --- a/tests/NuGet.Services.AzureSearch.Tests/SearchService/SearchTextBuilderFacts.cs +++ b/tests/NuGet.Services.AzureSearch.Tests/SearchService/SearchTextBuilderFacts.cs @@ -21,11 +21,8 @@ public void GeneratesAzureSearchQuery(string input, string expected) Assert.Equal(expected, actual); } - // TODO: id and packageId query fields should map to different fields with different analyzers - // See: https://github.com/NuGet/NuGetGallery/issues/6920 - // See: https://github.com/NuGet/NuGetGallery/issues/6922 [Theory] - [InlineData(false, "packageId:hello")] + [InlineData(false, "tokenizedPackageId:hello")] [InlineData(true, "packageId:hello")] public void WhenLuceneQuery_TreatsLeadingIdAsPackageId(bool luceneQuery, string expected) { @@ -122,11 +119,7 @@ public static IEnumerable CommonAzureSearchQueryData() { "", "*" }, { " ", "*" }, - // TODO: id should support partial matching - // TODO: packageId, version, and owners should be case insensitive - // See: https://github.com/NuGet/NuGetGallery/issues/6920 - // See: https://github.com/NuGet/NuGetGallery/issues/6922 - { "id:test", "packageId:test" }, + { "id:test", "tokenizedPackageId:test" }, { "packageId:json", "packageId:json" }, { "version:1.0.0-test", "normalizedVersion:1.0.0\\-test" }, { 
"title:hello", "title:hello" }, @@ -140,7 +133,7 @@ public static IEnumerable CommonAzureSearchQueryData() { "owners:nugget", "owners:nugget" }, // The NuGet query fields are case insensitive - { "ID:TEST", "packageId:TEST" }, + { "ID:TEST", "tokenizedPackageId:TEST" }, { "PACKAGEID:JSON", "packageId:JSON" }, { "VERSION:1.0.0-TEST", "normalizedVersion:1.0.0\\-TEST" }, { "TITLE:HELLO", "title:HELLO" }, @@ -165,11 +158,7 @@ public static IEnumerable CommonAzureSearchQueryData() { "tag:a,b;c|d", "tags:(a b c d)" }, { "tags:a,b;c|d", "tags:(a b c d)" }, - // TODO: id should support partial matching - // TODO: packageId, version, and owners should be case insensitive - // See: https://github.com/NuGet/NuGetGallery/issues/6920 - // See: https://github.com/NuGet/NuGetGallery/issues/6922 - { "id:foo id:bar", "packageId:(foo bar)" }, + { "id:foo id:bar", "tokenizedPackageId:(foo bar)" }, { "packageId:foo packageId:bar", "packageId:(foo bar)" }, { "title:hello title:world", "title:(hello world)" }, { "description:I description:am", "description:(I am)" }, @@ -192,8 +181,8 @@ public static IEnumerable CommonAzureSearchQueryData() // Quotes allow adjacent terms to be searched { @"""foo bar""", @"""foo bar""" }, { @"""foo bar"" baz", @"""foo bar"" baz" }, - { @"id:""foo bar""", @"packageId:""foo bar""" }, - { @"id:""a b"" c id:d packageId:e f", @"+packageId:(""a b"" d e) c f" }, + { @"title:""foo bar""", @"title:""foo bar""" }, + { @"title:""a b"" c title:d f", @"+title:(""a b"" d) c f" }, // Duplicate search terms on the same query field are folded { "a a", "a" },