Skip to content
This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
/ NuGet.Jobs Public archive

Commit

Permalink
[Azure Search] Add initial query parsing (#482)
Browse files Browse the repository at this point in the history
Parses NuGet queries and generates an Azure Search query. Addresses NuGet/NuGetGallery#6456
  • Loading branch information
loic-sharma authored Feb 27, 2019
1 parent 64012fc commit 4a91dfe
Show file tree
Hide file tree
Showing 6 changed files with 618 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
<Reference Include="Microsoft.CSharp" />
</ItemGroup>
<ItemGroup>
<Compile Include="SearchService\AzureSearchQueryBuilder.cs" />
<Compile Include="BlobContainerBuilder.cs" />
<Compile Include="IBlobContainerBuilder.cs" />
<Compile Include="Measure.cs" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NuGet.Services.AzureSearch.SearchService
{
/// <summary>
/// Used to build Azure Search Service queries. Used by <see cref="SearchTextBuilder"/>.
/// Given the query "fieldA:value1 value2":
///
/// * "value1" is a field-scoped value
/// * "value2" is a non-field scoped value
///
/// Null or empty values carry no term text and are ignored; a field scope is only
/// emitted if it has at least one usable value.
/// </summary>
internal class AzureSearchQueryBuilder
{
    /// <summary>
    /// The maximum number of clauses this builder allows in a single Azure Search query.
    /// See: https://docs.microsoft.com/en-us/azure/search/query-lucene-syntax#bkmk_querysizelimits
    /// </summary>
    private const int MaxClauses = 1024;

    /// <summary>
    /// The maximum size, in bytes, this builder allows for a single term of an Azure Search query.
    /// See: https://docs.microsoft.com/en-us/azure/search/query-lucene-syntax#bkmk_querysizelimits
    /// </summary>
    private const int MaxTermSizeBytes = 32 * 1024;

    /// <summary>
    /// These characters have special meaning in Azure Search and must be escaped if in user input.
    /// See: https://docs.microsoft.com/en-us/azure/search/query-lucene-syntax#escaping-special-characters
    /// </summary>
    private static readonly HashSet<char> SpecialCharacters = new HashSet<char>
    {
        '+', '-', '&', '|', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '\\', '/',
    };

    private readonly List<string> _nonFieldScopedValues;
    private readonly Dictionary<string, List<string>> _fieldScopedValues;

    public AzureSearchQueryBuilder()
    {
        _nonFieldScopedValues = new List<string>();
        _fieldScopedValues = new Dictionary<string, List<string>>();
    }

    /// <summary>
    /// Adds values that are not scoped to any field. Each value becomes its own top-level clause.
    /// </summary>
    /// <param name="values">The values to add. Null or empty strings are skipped.</param>
    /// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
    public void AddNonFieldScopedValues(IEnumerable<string> values)
    {
        _nonFieldScopedValues.AddRange(GetUsableValues(values));
    }

    /// <summary>
    /// Adds values scoped to <paramref name="fieldName"/>. Repeated calls for the same field accumulate.
    /// </summary>
    /// <param name="fieldName">The field name. It is emitted verbatim (not escaped).</param>
    /// <param name="values">The values to add. Null or empty strings are skipped.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public void AddFieldScopedValues(string fieldName, IEnumerable<string> values)
    {
        if (fieldName == null)
        {
            throw new ArgumentNullException(nameof(fieldName));
        }

        var usableValues = GetUsableValues(values);
        if (usableValues.Count == 0)
        {
            // Do not register an empty scope: it would be rendered as the malformed clause "field:()",
            // would be counted as a clause, and would force the "+" operator onto unrelated clauses.
            return;
        }

        if (!_fieldScopedValues.TryGetValue(fieldName, out var scopedValues))
        {
            scopedValues = new List<string>();
            _fieldScopedValues.Add(fieldName, scopedValues);
        }

        scopedValues.AddRange(usableValues);
    }

    /// <summary>
    /// Generates the query text: one required clause per field scope followed by the non-field-scoped values.
    /// </summary>
    /// <returns>The query text, or an empty string if no values were added.</returns>
    /// <exception cref="InvalidOperationException">
    /// The query has more than <see cref="MaxClauses"/> clauses or a term larger than <see cref="MaxTermSizeBytes"/>.
    /// </exception>
    public override string ToString()
    {
        ValidateOrThrow();

        var result = new StringBuilder();

        // At least one term from each field-scope must be matched. As Azure Search queries have an implicit "OR"
        // between clauses, field-scoped clauses are marked as required ("+") whenever the query has more than one
        // top-level clause. There is one top-level clause per field-scope and one per non-field-scoped value.
        var hasMultipleTopLevelClauses = _nonFieldScopedValues.Count > 0 || _fieldScopedValues.Count > 1;

        foreach (var fieldScope in _fieldScopedValues)
        {
            if (result.Length > 0)
            {
                result.Append(' ');
            }

            if (hasMultipleTopLevelClauses)
            {
                result.Append('+');
            }

            result.Append(fieldScope.Key);
            result.Append(':');

            if (fieldScope.Value.Count == 1)
            {
                AppendEscapedString(result, fieldScope.Value[0]);
            }
            else
            {
                // Multiple values for one field are grouped so that the field applies to all of them.
                result.Append('(');
                AppendEscapedValues(result, fieldScope.Value);
                result.Append(')');
            }
        }

        if (_nonFieldScopedValues.Count > 0)
        {
            if (result.Length > 0)
            {
                result.Append(' ');
            }

            AppendEscapedValues(result, _nonFieldScopedValues);
        }

        return result.ToString();
    }

    /// <summary>
    /// Validates the input sequence and returns the values that contain term text.
    /// </summary>
    private static List<string> GetUsableValues(IEnumerable<string> values)
    {
        if (values == null)
        {
            throw new ArgumentNullException(nameof(values));
        }

        return values.Where(value => !string.IsNullOrEmpty(value)).ToList();
    }

    /// <summary>
    /// Appends the escaped values separated by a single space.
    /// </summary>
    private static void AppendEscapedValues(StringBuilder result, IReadOnlyList<string> values)
    {
        for (var i = 0; i < values.Count; i++)
        {
            if (i > 0)
            {
                result.Append(' ');
            }

            AppendEscapedString(result, values[i]);
        }
    }

    /// <summary>
    /// Appends a single value, prefixing each special character with a backslash. Values that contain
    /// whitespace are wrapped in double quotes so that they remain a single phrase.
    /// </summary>
    private static void AppendEscapedString(StringBuilder result, string input)
    {
        var isPhrase = input.Any(char.IsWhiteSpace);
        if (isPhrase)
        {
            result.Append('"');
        }

        foreach (var c in input)
        {
            if (SpecialCharacters.Contains(c))
            {
                result.Append('\\');
            }

            result.Append(c);
        }

        if (isPhrase)
        {
            result.Append('"');
        }
    }

    /// <summary>
    /// Throws if the accumulated values would produce a query that exceeds the clause or term size limits.
    /// </summary>
    private void ValidateOrThrow()
    {
        // Azure Search has a limit on the number of clauses in a single query.
        // We generate a clause for each value in a field-scope, each field-scope,
        // and each non-field-scoped value.
        var fieldScopedClauses = _fieldScopedValues.Sum(CountFieldScopedClauses);
        var nonFieldScopedClauses = _nonFieldScopedValues.Count;

        if ((fieldScopedClauses + nonFieldScopedClauses) > MaxClauses)
        {
            throw new InvalidOperationException($"A query can only have up to {MaxClauses} clauses");
        }

        if (_fieldScopedValues.Values.Any(terms => terms.Any(TermExceedsMaxSize))
            || _nonFieldScopedValues.Any(TermExceedsMaxSize))
        {
            throw new InvalidOperationException($"Query terms cannot exceed {MaxTermSizeBytes} bytes");
        }
    }

    /// <summary>
    /// Counts the clauses generated for a single field scope.
    /// </summary>
    private static int CountFieldScopedClauses(KeyValuePair<string, List<string>> fieldScopedValues)
    {
        // We will only generate a single clause if this field-scope only has a single term.
        if (fieldScopedValues.Value.Count == 1)
        {
            return 1;
        }

        // Otherwise, we will generate a clause for each term and a clause to OR the terms together.
        return fieldScopedValues.Value.Count + 1;
    }

    /// <summary>
    /// Determines whether the term's UTF-16 encoded size is larger than <see cref="MaxTermSizeBytes"/>.
    /// </summary>
    private static bool TermExceedsMaxSize(string term)
    {
        return (Encoding.Unicode.GetByteCount(term) > MaxTermSizeBytes);
    }
}
}
34 changes: 17 additions & 17 deletions src/NuGet.Services.AzureSearch/SearchService/IndexFields.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,27 @@ public static class IndexFields
{
private static readonly NamingStrategy CamelCaseNamingStrategy = new CamelCaseNamingStrategy();

public static readonly string LastEdited = CamelCaseNamingStrategy.GetPropertyName(
nameof(BaseMetadataDocument.LastEdited),
hasSpecifiedName: false);

public static readonly string Published = CamelCaseNamingStrategy.GetPropertyName(
nameof(BaseMetadataDocument.Published),
hasSpecifiedName: false);

public static readonly string SortableTitle = CamelCaseNamingStrategy.GetPropertyName(
nameof(BaseMetadataDocument.SortableTitle),
hasSpecifiedName: false);
private static string Name(string input)
{
return CamelCaseNamingStrategy.GetPropertyName(input, hasSpecifiedName: false);
}

public static readonly string SemVerLevel = CamelCaseNamingStrategy.GetPropertyName(
nameof(BaseMetadataDocument.SemVerLevel),
hasSpecifiedName: false);
public static readonly string Authors = Name(nameof(BaseMetadataDocument.Authors));
public static readonly string Description = Name(nameof(BaseMetadataDocument.Description));
public static readonly string LastEdited = Name(nameof(BaseMetadataDocument.LastEdited));
public static readonly string NormalizedVersion = Name(nameof(BaseMetadataDocument.NormalizedVersion));
public static readonly string PackageId = Name(nameof(BaseMetadataDocument.PackageId));
public static readonly string Published = Name(nameof(BaseMetadataDocument.Published));
public static readonly string SemVerLevel = Name(nameof(BaseMetadataDocument.SemVerLevel));
public static readonly string SortableTitle = Name(nameof(BaseMetadataDocument.SortableTitle));
public static readonly string Summary = Name(nameof(BaseMetadataDocument.Summary));
public static readonly string Tags = Name(nameof(BaseMetadataDocument.Tags));
public static readonly string Title = Name(nameof(BaseMetadataDocument.Title));

public static class Search
{
public static readonly string SearchFilters = CamelCaseNamingStrategy.GetPropertyName(
nameof(SearchDocument.UpdateLatest.SearchFilters),
hasSpecifiedName: false);
public static readonly string Owners = Name(nameof(SearchDocument.Full.Owners));
public static readonly string SearchFilters = Name(nameof(SearchDocument.UpdateLatest.SearchFilters));
}
}
}
108 changes: 101 additions & 7 deletions src/NuGet.Services.AzureSearch/SearchService/SearchTextBuilder.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,48 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using NuGet.Indexing;
using NuGet.Services.Metadata.Catalog;
using NuGet.Versioning;

namespace NuGet.Services.AzureSearch.SearchService
{
public class SearchTextBuilder : ISearchTextBuilder
{
private const string MatchAllDocumentsQuery = "*";

/// <summary>
/// Maps each supported <see cref="QueryField"/> to the name of the index field used when scoping a query value.
/// Both <see cref="QueryField.Id"/> and <see cref="QueryField.PackageId"/> map to the package ID field.
/// <see cref="QueryField.Any"/> and <see cref="QueryField.Invalid"/> have no entry; callers must handle
/// those two fields before looking up this map.
/// </summary>
private static readonly IReadOnlyDictionary<QueryField, string> FieldNames = new Dictionary<QueryField, string>
{
{ QueryField.Author, IndexFields.Authors },
{ QueryField.Description, IndexFields.Description },
{ QueryField.Id, IndexFields.PackageId },
{ QueryField.Owner, IndexFields.Search.Owners },
{ QueryField.PackageId, IndexFields.PackageId },
{ QueryField.Summary, IndexFields.Summary },
{ QueryField.Tag, IndexFields.Tags },
{ QueryField.Title, IndexFields.Title },
{ QueryField.Version, IndexFields.NormalizedVersion },
};

private readonly NuGetQueryParser _parser;

public SearchTextBuilder()
{
_parser = new NuGetQueryParser();
}

public string V2Search(V2SearchRequest request)
{
var query = request.Query;

if (request.LuceneQuery)
// The old V2 search service would treat "id:" queries (~match) in the same way as it did "packageid:" (==match).
// If "id:" is in the query, replace it.
if (request.LuceneQuery && !string.IsNullOrEmpty(query) && query.StartsWith("id:", StringComparison.OrdinalIgnoreCase))
{
// TODO: convert a leading "id:" to "packageid:"
// https://github.com/NuGet/NuGetGallery/issues/6456
query = "packageid:" + query.Substring(3);
}

return GetLuceneQuery(query);
Expand All @@ -23,11 +53,75 @@ public string V3Search(V3SearchRequest request)
return GetLuceneQuery(request.Query);
}

private static string GetLuceneQuery(string query)
private string GetLuceneQuery(string query)
{
    // A missing or blank query matches every document.
    if (string.IsNullOrWhiteSpace(query))
    {
        return MatchAllDocumentsQuery;
    }

    // Parse the trimmed query into values grouped by field. If nothing was parsed there is
    // nothing to build, so the search text is left unset.
    var parsedFields = _parser.ParseQuery(query.Trim());
    var searchText = parsedFields.Count == 0
        ? null
        : ToAzureSearchQuery(parsedFields).ToString();

    // Fall back to matching every document if no search text could be generated.
    return string.IsNullOrWhiteSpace(searchText)
        ? MatchAllDocumentsQuery
        : searchText;
}

private AzureSearchQueryBuilder ToAzureSearchQuery(Dictionary<QueryField, HashSet<string>> grouping)
{
    var builder = new AzureSearchQueryBuilder();

    foreach (var entry in grouping)
    {
        var field = entry.Key;
        var terms = entry.Value;

        switch (field)
        {
            // Values scoped to an invalid field are left out of the generated query.
            case QueryField.Invalid:
                break;

            // Values that aren't scoped to a field.
            case QueryField.Any:
                builder.AddNonFieldScopedValues(terms);
                break;

            // Values scoped to a valid field: resolve the index field name, then process the values.
            default:
                builder.AddFieldScopedValues(FieldNames[field], ProcessFieldValues(field, terms));
                break;
        }
    }

    return builder;
}

private static IEnumerable<string> ProcessFieldValues(QueryField field, IEnumerable<string> values)
{
// TODO: query parsing
// https://github.com/NuGet/NuGetGallery/issues/6456
return query ?? "*";
switch (field)
{
// Expand tags by their delimiters
case QueryField.Tag:
return values.SelectMany(Utils.SplitTags).Distinct();

// The "version" query field should be normalized if possible.
case QueryField.Version:
return values.Select(value =>
{
if (!NuGetVersion.TryParse(value, out var version))
{
return value;
}

return version.ToNormalizedString();
});

default:
return values;
}
}
}
}
Loading

0 comments on commit 4a91dfe

Please sign in to comment.