Skip to content
This repository has been archived by the owner on Mar 16, 2021. It is now read-only.

Commit

Permalink
[Azure Search] Add custom analyzers (#485)
Browse files Browse the repository at this point in the history
Adds the following custom analyzers:

1. The `ExactMatchCustomAnalyzer` that allows for case insensitive exact matches. This is used for the `packageId`, `version`, `owner`, and `owners` query fields.
2. The `PackageIdCustomAnalyzer` that splits on non alpha-numeric characters and camel casing. This is used by the `id` query field. This corresponds to the existing [`IdentifierAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/IdentifierAnalyzer.cs).
3. The `DescriptionAnalyzer` (source file `DescriptionCustomAnalyzer.cs`) that splits on non alpha-numeric characters and camel casing and removes stopwords (like "the" or "and"). This is used by the `title`, `description`, `author`, `authors`, and `summary` query fields. This corresponds to the existing [`DescriptionAnalyzer`](https://github.com/NuGet/NuGet.Services.Metadata/blob/master/src/NuGet.Indexing/DescriptionAnalyzer.cs).

Addresses NuGet/NuGetGallery#6920 and NuGet/NuGetGallery#6922

Also, see functional tests: #487
  • Loading branch information
loic-sharma authored Mar 1, 2019
1 parent 7479ce4 commit d3e6b7a
Show file tree
Hide file tree
Showing 17 changed files with 182 additions and 26 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Generic;
using Microsoft.Azure.Search.Models;

namespace NuGet.Services.AzureSearch
{
/// <summary>
/// Support for NuGet style description analysis. This splits tokens
/// on non alpha-numeric characters, splits tokens on camel casing,
/// lower cases tokens, and then removes stopwords from tokens.
/// </summary>
public static class DescriptionAnalyzer
{
    /// <summary>
    /// The analyzer name. Fields opt in to this analyzer by passing this
    /// value to their <c>[Analyzer(...)]</c> attribute.
    /// </summary>
    public const string Name = "nuget_description_analyzer";

    /// <summary>
    /// The analyzer definition that is added to the index's analyzers.
    /// Token filters are applied in the order they are listed.
    /// </summary>
    public static readonly CustomAnalyzer Instance = new CustomAnalyzer(
        Name,
        PackageIdCustomTokenizer.Name,
        new List<TokenFilterName>
        {
            // Split on camel casing first, while the casing is still intact.
            IdentifierCustomTokenFilter.Name,

            // Lower case BEFORE removing stopwords. The stopword filter does
            // not ignore case by default, so running it on tokens that have
            // not been lower cased yet would let "The" or "And" through.
            TokenFilterName.Lowercase,
            TokenFilterName.Stopwords,
        });
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Generic;
using Microsoft.Azure.Search.Models;

namespace NuGet.Services.AzureSearch
{
/// <summary>
/// Analyzer for fields that must be matched exactly, ignoring case,
/// in an Azure Search index. The whole field value is kept as a single
/// token (keyword tokenizer) and that token is lower cased.
/// </summary>
public static class ExactMatchCustomAnalyzer
{
    /// <summary>
    /// The name fields use to reference this analyzer.
    /// </summary>
    public const string Name = "nuget_exact_match_analyzer";

    /// <summary>
    /// The analyzer definition to register on the index.
    /// </summary>
    public static readonly CustomAnalyzer Instance = CreateAnalyzer();

    // Builds the analyzer: keyword tokenizer followed by a single lowercase filter.
    private static CustomAnalyzer CreateAnalyzer()
    {
        var lowercaseOnly = new List<TokenFilterName> { TokenFilterName.Lowercase };

        return new CustomAnalyzer(Name, TokenizerName.Keyword, lowercaseOnly);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using Microsoft.Azure.Search.Models;

namespace NuGet.Services.AzureSearch
{
/// <summary>
/// Splits tokens on camel casing and non alpha-numeric characters.
/// This does not consume the original token. For example, "Foo2Bar.Baz"
/// becomes "Foo", "2", "Bar", "Baz", and "Foo2Bar.Baz".
/// </summary>
public static class IdentifierCustomTokenFilter
{
    /// <summary>
    /// The name analyzers use to reference this token filter.
    /// </summary>
    public const string Name = "nuget_id_filter";

    /// <summary>
    /// The token filter definition to register on the index. Declared
    /// <c>readonly</c>, like the other analysis singletons, so the shared
    /// definition cannot be swapped out at runtime.
    /// </summary>
    public static readonly WordDelimiterTokenFilter Instance = new WordDelimiterTokenFilter(
        Name,
        splitOnCaseChange: true,  // "FooBar" is split into "Foo" and "Bar"
        preserveOriginal: true);  // the unsplit token is emitted as well
}
}
27 changes: 27 additions & 0 deletions src/NuGet.Services.AzureSearch/Analysis/PackageIdCustomAnalyzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Generic;
using Microsoft.Azure.Search.Models;

namespace NuGet.Services.AzureSearch
{
/// <summary>
/// NuGet style analysis for package identifiers: tokens are split on
/// non alpha-numeric characters and on camel casing, then lower cased.
/// </summary>
public static class PackageIdCustomAnalyzer
{
    /// <summary>
    /// The name fields use to reference this analyzer.
    /// </summary>
    public const string Name = "nuget_package_id_analyzer";

    /// <summary>
    /// The analyzer definition to register on the index.
    /// </summary>
    public static readonly CustomAnalyzer Instance = CreateAnalyzer();

    // Builds the analyzer: package ID tokenizer, then the identifier
    // splitting filter, then lower casing (filters run in list order).
    private static CustomAnalyzer CreateAnalyzer()
    {
        var filters = new List<TokenFilterName>();
        filters.Add(IdentifierCustomTokenFilter.Name);
        filters.Add(TokenFilterName.Lowercase);

        return new CustomAnalyzer(Name, PackageIdCustomTokenizer.Name, filters);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using Microsoft.Azure.Search.Models;

namespace NuGet.Services.AzureSearch
{
/// <summary>
/// Splits input into tokens on a fixed set of symbols.
/// For example, "Foo.Bar" becomes "Foo" and "Bar".
/// </summary>
public static class PackageIdCustomTokenizer
{
    /// <summary>
    /// The name analyzers use to reference this tokenizer.
    /// </summary>
    public const string Name = "nuget_package_id_tokenizer";

    // Character class listing every symbol that separates two tokens.
    private const string SeparatorPattern = @"[.\-_,;:'*#!~+()\[\]{}]";

    /// <summary>
    /// The tokenizer definition to register on the index.
    /// </summary>
    public static readonly PatternTokenizer Instance =
        new PatternTokenizer(Name, SeparatorPattern);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ private bool ShouldWait(ConcurrentBag<NewPackageRegistration> allWork, bool log)
if (log)
{
_logger.LogInformation(
"There are {PackageCount} packages to in memory, waiting to be pushed to Azure Search. " +
"There are {PackageCount} packages in memory waiting to be pushed to Azure Search. " +
"Waiting until this number drops below {Max} before fetching more packages.",
packageCount,
max);
Expand Down
2 changes: 2 additions & 0 deletions src/NuGet.Services.AzureSearch/DocumentUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ public static void PopulateMetadata(
document.Summary = package.Summary;
document.Tags = package.Tags == null ? null : Utils.SplitTags(package.Tags);
document.Title = package.Title;
document.TokenizedPackageId = packageId;
}

public static void PopulateMetadata(
Expand Down Expand Up @@ -136,6 +137,7 @@ public static void PopulateMetadata(
document.Summary = leaf.Summary;
document.Tags = leaf.Tags == null ? null : leaf.Tags.ToArray();
document.Title = leaf.Title;
document.TokenizedPackageId = leaf.PackageId;
}

private static string GetSortableTitle(string title, string packageId)
Expand Down
32 changes: 25 additions & 7 deletions src/NuGet.Services.AzureSearch/IndexBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.Azure.Search;
using Microsoft.Azure.Search.Models;
Expand Down Expand Up @@ -92,19 +93,36 @@ private async Task CreateIndexIfNotExistsAsync(Index index)

/// <summary>
/// Builds the definition of the search index: named by the configured
/// <c>SearchIndexName</c>, with fields built from <see cref="SearchDocument.Full"/>.
/// </summary>
/// <returns>The index definition produced by <c>InitializeIndex</c>.</returns>
private Index InitializeSearchIndex()
{
    // Go through the shared builder only. The earlier inline
    // "return new Index { ... }" returned first and made this call unreachable.
    return InitializeIndex<SearchDocument.Full>(
        _options.Value.SearchIndexName);
}

/// <summary>
/// Builds the definition of the hijack index: named by the configured
/// <c>HijackIndexName</c>, with fields built from <see cref="HijackDocument.Full"/>.
/// </summary>
private Index InitializeHijackIndex()
{
    var hijackIndexName = _options.Value.HijackIndexName;

    return InitializeIndex<HijackDocument.Full>(hijackIndexName);
}

/// <summary>
/// Builds an index definition with the given name, with fields built from
/// <typeparamref name="TDocument"/> and with the custom analyzers,
/// tokenizer and token filter registered on it.
/// </summary>
/// <typeparam name="TDocument">The document model the index fields are built from.</typeparam>
/// <param name="name">The name of the index.</param>
/// <returns>The index definition.</returns>
private Index InitializeIndex<TDocument>(string name)
{
    return new Index
    {
        // Each member is initialized exactly once, from the method's own
        // inputs rather than from the hijack-specific name and document type.
        Name = name,
        Fields = FieldBuilder.BuildForType<TDocument>(),

        // The custom analysis components must be registered on the index
        // itself so that fields can reference them by name.
        Analyzers = new List<Analyzer>
        {
            DescriptionAnalyzer.Instance,
            ExactMatchCustomAnalyzer.Instance,
            PackageIdCustomAnalyzer.Instance,
        },
        Tokenizers = new List<Tokenizer>
        {
            PackageIdCustomTokenizer.Instance,
        },
        TokenFilters = new List<TokenFilter>
        {
            IdentifierCustomTokenFilter.Instance,
        },
    };
}
}
Expand Down
16 changes: 16 additions & 0 deletions src/NuGet.Services.AzureSearch/Models/BaseMetadataDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@ public abstract class BaseMetadataDocument : CommittedDocument, IBaseMetadataDoc
public int? SemVerLevel { get; set; }

[IsSearchable]
[Analyzer(DescriptionAnalyzer.Name)]
public string Authors { get; set; }

public string Copyright { get; set; }
public DateTimeOffset? Created { get; set; }

[IsSearchable]
[Analyzer(DescriptionAnalyzer.Name)]
public string Description { get; set; }

public long? FileSize { get; set; }
Expand All @@ -36,11 +38,16 @@ public abstract class BaseMetadataDocument : CommittedDocument, IBaseMetadataDoc
public string MinClientVersion { get; set; }

[IsSearchable]
[Analyzer(ExactMatchCustomAnalyzer.Name)]
public string NormalizedVersion { get; set; }

public string OriginalVersion { get; set; }

/// <summary>
/// The package's identifier. Supports case insensitive exact matching.
/// </summary>
[IsSearchable]
[Analyzer(ExactMatchCustomAnalyzer.Name)]
public string PackageId { get; set; }

[IsFilterable]
Expand All @@ -58,12 +65,21 @@ public abstract class BaseMetadataDocument : CommittedDocument, IBaseMetadataDoc
public string SortableTitle { get; set; }

[IsSearchable]
[Analyzer(DescriptionAnalyzer.Name)]
public string Summary { get; set; }

[IsSearchable]
public string[] Tags { get; set; }

[IsSearchable]
[Analyzer(DescriptionAnalyzer.Name)]
public string Title { get; set; }

/// <summary>
/// The package's identifier. Supports tokenized search.
/// </summary>
[IsSearchable]
[Analyzer(PackageIdCustomAnalyzer.Name)]
public string TokenizedPackageId { get; set; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public interface IBaseMetadataDocument : ICommittedDocument
string NormalizedVersion { get; set; }
string OriginalVersion { get; set; }
string PackageId { get; set; }
string TokenizedPackageId { get; set; }
bool? Prerelease { get; set; }
string ProjectUrl { get; set; }
DateTimeOffset? Published { get; set; }
Expand Down
1 change: 1 addition & 0 deletions src/NuGet.Services.AzureSearch/Models/SearchDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public class Full : AddFirst
/// <summary>
/// Extends <see cref="UpdateLatest"/> with the <see cref="Owners"/> field.
/// </summary>
public class AddFirst : UpdateLatest
{
    /// <summary>
    /// Searchable field analyzed with <see cref="ExactMatchCustomAnalyzer"/>.
    /// </summary>
    [IsSearchable, Analyzer(ExactMatchCustomAnalyzer.Name)]
    public string[] Owners { get; set; }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,16 @@
<Reference Include="Microsoft.CSharp" />
</ItemGroup>
<ItemGroup>
<Compile Include="Analysis\DescriptionCustomAnalyzer.cs" />
<Compile Include="Analysis\IdentifierCustomTokenFilter.cs" />
<Compile Include="Analysis\PackageIdCustomAnalyzer.cs" />
<Compile Include="Analysis\PackageIdCustomTokenizer.cs" />
<Compile Include="SearchService\AzureSearchQueryBuilder.cs" />
<Compile Include="BlobContainerBuilder.cs" />
<Compile Include="IBlobContainerBuilder.cs" />
<Compile Include="Measure.cs" />
<Compile Include="DurationMeasurement.cs" />
<Compile Include="Analysis\ExactMatchCustomAnalyzer.cs" />
<Compile Include="SearchService\AuxiliaryDataCache.cs" />
<Compile Include="SearchService\AuxiliaryFileClient.cs" />
<Compile Include="SearchService\AuxiliaryFileReloader.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ private static string Name(string input)
public static readonly string Summary = Name(nameof(BaseMetadataDocument.Summary));
public static readonly string Tags = Name(nameof(BaseMetadataDocument.Tags));
public static readonly string Title = Name(nameof(BaseMetadataDocument.Title));
public static readonly string TokenizedPackageId = Name(nameof(BaseMetadataDocument.TokenizedPackageId));

public static class Search
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class SearchTextBuilder : ISearchTextBuilder
{
{ QueryField.Author, IndexFields.Authors },
{ QueryField.Description, IndexFields.Description },
{ QueryField.Id, IndexFields.PackageId },
{ QueryField.Id, IndexFields.TokenizedPackageId },
{ QueryField.Owner, IndexFields.Search.Owners },
{ QueryField.PackageId, IndexFields.PackageId },
{ QueryField.Summary, IndexFields.Summary },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ public async Task SetsExpectedProperties()
""windowsazureofficial""
],
""title"": ""Windows Azure Storage"",
""tokenizedPackageId"": ""WindowsAzure.Storage"",
""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"",
""lastDocumentType"": ""NuGet.Services.AzureSearch.HijackDocument+Full"",
""lastUpdatedFromCatalog"": false,
Expand Down Expand Up @@ -288,6 +289,7 @@ public async Task SetsExpectedProperties()
""windowsazureofficial""
],
""title"": ""Windows Azure Storage"",
""tokenizedPackageId"": ""WindowsAzure.Storage"",
""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"",
""lastDocumentType"": ""NuGet.Services.AzureSearch.HijackDocument+Full"",
""lastUpdatedFromCatalog"": true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ public async Task SetsExpectedProperties(SearchFilters searchFilters, string exp
""windowsazureofficial""
],
""title"": ""Windows Azure Storage"",
""tokenizedPackageId"": ""WindowsAzure.Storage"",
""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"",
""lastDocumentType"": ""NuGet.Services.AzureSearch.SearchDocument+UpdateLatest"",
""lastUpdatedFromCatalog"": true,
Expand Down Expand Up @@ -446,6 +447,7 @@ public async Task SetsExpectedProperties(SearchFilters searchFilters, string exp
""windowsazureofficial""
],
""title"": ""Windows Azure Storage"",
""tokenizedPackageId"": ""WindowsAzure.Storage"",
""lastUpdatedDocument"": ""2018-12-14T09:30:00+00:00"",
""lastDocumentType"": ""NuGet.Services.AzureSearch.SearchDocument+Full"",
""lastUpdatedFromCatalog"": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,8 @@ public void GeneratesAzureSearchQuery(string input, string expected)
Assert.Equal(expected, actual);
}

// TODO: id and packageId query fields should map to different fields with different analyzers
// See: https://github.com/NuGet/NuGetGallery/issues/6920
// See: https://github.com/NuGet/NuGetGallery/issues/6922
[Theory]
[InlineData(false, "packageId:hello")]
[InlineData(false, "tokenizedPackageId:hello")]
[InlineData(true, "packageId:hello")]
public void WhenLuceneQuery_TreatsLeadingIdAsPackageId(bool luceneQuery, string expected)
{
Expand Down Expand Up @@ -122,11 +119,7 @@ public static IEnumerable<object[]> CommonAzureSearchQueryData()
{ "", "*" },
{ " ", "*" },

// TODO: id should support partial matching
// TODO: packageId, version, and owners should be case insensitive
// See: https://github.com/NuGet/NuGetGallery/issues/6920
// See: https://github.com/NuGet/NuGetGallery/issues/6922
{ "id:test", "packageId:test" },
{ "id:test", "tokenizedPackageId:test" },
{ "packageId:json", "packageId:json" },
{ "version:1.0.0-test", "normalizedVersion:1.0.0\\-test" },
{ "title:hello", "title:hello" },
Expand All @@ -140,7 +133,7 @@ public static IEnumerable<object[]> CommonAzureSearchQueryData()
{ "owners:nugget", "owners:nugget" },

// The NuGet query fields are case insensitive
{ "ID:TEST", "packageId:TEST" },
{ "ID:TEST", "tokenizedPackageId:TEST" },
{ "PACKAGEID:JSON", "packageId:JSON" },
{ "VERSION:1.0.0-TEST", "normalizedVersion:1.0.0\\-TEST" },
{ "TITLE:HELLO", "title:HELLO" },
Expand All @@ -165,11 +158,7 @@ public static IEnumerable<object[]> CommonAzureSearchQueryData()
{ "tag:a,b;c|d", "tags:(a b c d)" },
{ "tags:a,b;c|d", "tags:(a b c d)" },

// TODO: id should support partial matching
// TODO: packageId, version, and owners should be case insensitive
// See: https://github.com/NuGet/NuGetGallery/issues/6920
// See: https://github.com/NuGet/NuGetGallery/issues/6922
{ "id:foo id:bar", "packageId:(foo bar)" },
{ "id:foo id:bar", "tokenizedPackageId:(foo bar)" },
{ "packageId:foo packageId:bar", "packageId:(foo bar)" },
{ "title:hello title:world", "title:(hello world)" },
{ "description:I description:am", "description:(I am)" },
Expand All @@ -192,8 +181,8 @@ public static IEnumerable<object[]> CommonAzureSearchQueryData()
// Quotes allow adjacent terms to be searched
{ @"""foo bar""", @"""foo bar""" },
{ @"""foo bar"" baz", @"""foo bar"" baz" },
{ @"id:""foo bar""", @"packageId:""foo bar""" },
{ @"id:""a b"" c id:d packageId:e f", @"+packageId:(""a b"" d e) c f" },
{ @"title:""foo bar""", @"title:""foo bar""" },
{ @"title:""a b"" c title:d f", @"+title:(""a b"" d) c f" },

// Duplicate search terms on the same query field are folded
{ "a a", "a" },
Expand Down

0 comments on commit d3e6b7a

Please sign in to comment.