Skip to content
This repository has been archived by the owner on Mar 16, 2021. It is now read-only.

Commit

Permalink
Add popularity transfers comparer (#766)
Browse files Browse the repository at this point in the history
The `auxiliary2azuresearch` job needs to know which popularity transfers have changed to properly update the search index.

Previous change: #765
Part of NuGet/NuGetGallery#7898
  • Loading branch information
loic-sharma authored Apr 14, 2020
1 parent 11e04f1 commit 981ec3c
Show file tree
Hide file tree
Showing 12 changed files with 590 additions and 287 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Microsoft.Extensions.Logging;

namespace NuGet.Services.AzureSearch.Auxiliary2AzureSearch
{
/// <summary>
/// Compares an old and a new map of package ID to a set of string values (owners or popularity transfers)
/// and reports the package IDs whose value set was added, changed, or removed.
/// </summary>
public class DataSetComparer : IDataSetComparer
{
    private static readonly string[] EmptyStringArray = new string[0];

    private readonly IAzureSearchTelemetryService _telemetryService;
    private readonly ILogger<DataSetComparer> _logger;

    /// <summary>
    /// Initializes the comparer.
    /// </summary>
    /// <param name="telemetryService">Receives the timing and count metrics of each comparison.</param>
    /// <param name="logger">Receives one informational message per added, changed, or removed key.</param>
    /// <exception cref="ArgumentNullException">Thrown if any argument is null.</exception>
    public DataSetComparer(
        IAzureSearchTelemetryService telemetryService,
        ILogger<DataSetComparer> logger)
    {
        _telemetryService = telemetryService ?? throw new ArgumentNullException(nameof(telemetryService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Compares two package ID to owners maps. Owner values are compared case-sensitively (ordinal).
    /// </summary>
    /// <param name="oldData">The old owner data. Must use <see cref="StringComparer.OrdinalIgnoreCase"/> for keys.</param>
    /// <param name="newData">The new owner data. Must use <see cref="StringComparer.OrdinalIgnoreCase"/> for keys.</param>
    /// <returns>
    /// The package IDs that were added, changed, or removed. Added and changed IDs map to the full new owner
    /// list; removed IDs map to an empty array.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown if either map is null.</exception>
    /// <exception cref="ArgumentException">Thrown if either map does not use the expected key comparer.</exception>
    public SortedDictionary<string, string[]> CompareOwners(
        SortedDictionary<string, SortedSet<string>> oldData,
        SortedDictionary<string, SortedSet<string>> newData)
    {
        // Use ordinal comparison to allow username case changes to flow through.
        var stopwatch = Stopwatch.StartNew();
        var result = CompareData(
            oldData,
            newData,
            "package ID",
            "owners",
            StringComparer.Ordinal);

        stopwatch.Stop();
        _telemetryService.TrackOwnerSetComparison(oldData.Count, newData.Count, result.Count, stopwatch.Elapsed);

        return result;
    }

    /// <summary>
    /// Compares two package ID to popularity transfers maps. Transfer values are compared
    /// case-insensitively (ordinal).
    /// </summary>
    /// <param name="oldData">The old popularity transfers. Must use <see cref="StringComparer.OrdinalIgnoreCase"/> for keys.</param>
    /// <param name="newData">The new popularity transfers. Must use <see cref="StringComparer.OrdinalIgnoreCase"/> for keys.</param>
    /// <returns>
    /// The package IDs that were added, changed, or removed. Added and changed IDs map to the full new transfer
    /// list; removed IDs map to an empty array.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown if either map is null.</exception>
    /// <exception cref="ArgumentException">Thrown if either map does not use the expected key comparer.</exception>
    public SortedDictionary<string, string[]> ComparePopularityTransfers(
        SortedDictionary<string, SortedSet<string>> oldData,
        SortedDictionary<string, SortedSet<string>> newData)
    {
        // Ignore case changes in popularity transfers.
        var stopwatch = Stopwatch.StartNew();
        var result = CompareData(
            oldData,
            newData,
            "package ID",
            "popularity transfers",
            StringComparer.OrdinalIgnoreCase);

        stopwatch.Stop();
        _telemetryService.TrackPopularityTransfersSetComparison(oldData.Count, newData.Count, result.Count, stopwatch.Elapsed);

        return result;
    }

    /// <summary>
    /// Shared comparison routine behind the public methods.
    /// </summary>
    /// <param name="oldData">The old key to values map.</param>
    /// <param name="newData">The new key to values map.</param>
    /// <param name="keyName">Human-readable name of the key, used only in log messages.</param>
    /// <param name="valuesName">Human-readable name of the values, used only in log messages.</param>
    /// <param name="valuesComparer">Comparer deciding whether two values in a set are the same.</param>
    /// <returns>A case-insensitively keyed map of the keys that were added, changed, or removed.</returns>
    private SortedDictionary<string, string[]> CompareData(
        SortedDictionary<string, SortedSet<string>> oldData,
        SortedDictionary<string, SortedSet<string>> newData,
        string keyName,
        string valuesName,
        StringComparer valuesComparer)
    {
        // Fail with a descriptive exception instead of a NullReferenceException on the comparer checks below.
        if (oldData == null)
        {
            throw new ArgumentNullException(nameof(oldData));
        }

        if (newData == null)
        {
            throw new ArgumentNullException(nameof(newData));
        }

        if (oldData.Comparer != StringComparer.OrdinalIgnoreCase)
        {
            throw new ArgumentException("The old data should have a case-insensitive comparer.", nameof(oldData));
        }

        if (newData.Comparer != StringComparer.OrdinalIgnoreCase)
        {
            throw new ArgumentException("The new data should have a case-insensitive comparer.", nameof(newData));
        }

        // We use a very simplistic algorithm here. Perform one pass on the new data to find the added or changed
        // values. Then perform a second pass on the old data to find removed keys. We can optimize
        // this later if necessary.
        //
        // On the "changed" case, we emit all of the values instead of the delta. This is because Azure Search
        // does not have a way to append or remove a specific item from a field that is an array.
        // The entire new array needs to be provided.
        var result = new SortedDictionary<string, string[]>(StringComparer.OrdinalIgnoreCase);

        // First pass: find added or changed sets.
        foreach (var pair in newData)
        {
            var key = pair.Key;
            var newValues = pair.Value;
            if (!oldData.TryGetValue(key, out var oldValues))
            {
                // ADDED: The key does not exist in the old data, which means the key was added.
                result.Add(key, newValues.ToArray());
                _logger.LogInformation(
                    $"The {keyName} {{Key}} has been added, with {{AddedCount}} {valuesName}.",
                    key,
                    newValues.Count);
            }
            else
            {
                // The key exists in the old data. We need to check if the values set has changed.
                var removedValues = oldValues.Except(newValues, valuesComparer).ToList();
                var addedValues = newValues.Except(oldValues, valuesComparer).ToList();

                if (removedValues.Any() || addedValues.Any())
                {
                    // CHANGED: The values set has changed.
                    result.Add(key, newValues.ToArray());
                    _logger.LogInformation(
                        $"The {keyName} {{Key}} {valuesName} have changed, with {{RemovedCount}} {valuesName} removed and " +
                        $"{{AddedCount}} {valuesName} added.",
                        key,
                        removedValues.Count,
                        addedValues.Count);
                }
            }
        }

        // Second pass: find removed sets.
        foreach (var pair in oldData)
        {
            var key = pair.Key;
            var oldValues = pair.Value;

            if (!newData.TryGetValue(key, out var newValues))
            {
                // REMOVED: The key does not exist in the new data, which means the key was removed.
                result.Add(key, EmptyStringArray);
                _logger.LogInformation(
                    $"The {keyName} {{Key}} has been removed, with {{RemovedCount}} {valuesName}",
                    key,
                    oldValues.Count);
            }
        }

        return result;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
namespace NuGet.Services.AzureSearch.Auxiliary2AzureSearch
{
/// <summary>
/// Used to compare two sets of owners to determine the changes.
/// Used to compare two sets of data to determine the changes.
/// </summary>
public interface IOwnerSetComparer
public interface IDataSetComparer
{
/// <summary>
/// Compares two sets of owners to determine the package IDs that have changed. The returned dictionary
Expand All @@ -19,7 +19,19 @@ public interface IOwnerSetComparer
/// </summary>
/// <param name="oldData">The old owner information, typically from storage.</param>
/// <param name="newData">The new owner information, typically from gallery DB.</param>
SortedDictionary<string, string[]> Compare(
SortedDictionary<string, string[]> CompareOwners(
SortedDictionary<string, SortedSet<string>> oldData,
SortedDictionary<string, SortedSet<string>> newData);

/// <summary>
/// Compares two sets of popularity transfers to determine changes. The two inputs are maps of package IDs that transfer
/// popularity away to package IDs that receive the popularity. The returned dictionary is a subset of these inputs that
/// were added, removed, or changed. For the "added" and "changed" cases, the popularity transfer set is the new data.
/// For the "removed" case, the set is empty.
/// </summary>
/// <param name="oldData">The old popularity transfers, typically from storage.</param>
/// <param name="newData">The new popularity transfers, typically from gallery DB.</param>
SortedDictionary<string, string[]> ComparePopularityTransfers(
SortedDictionary<string, SortedSet<string>> oldData,
SortedDictionary<string, SortedSet<string>> newData);
}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public class UpdateOwnersCommand : IAzureSearchCommand
{
private readonly IDatabaseAuxiliaryDataFetcher _databaseFetcher;
private readonly IOwnerDataClient _ownerDataClient;
private readonly IOwnerSetComparer _ownerSetComparer;
private readonly IDataSetComparer _ownerSetComparer;
private readonly ISearchDocumentBuilder _searchDocumentBuilder;
private readonly ISearchIndexActionBuilder _searchIndexActionBuilder;
private readonly Func<IBatchPusher> _batchPusherFactory;
Expand All @@ -28,7 +28,7 @@ public class UpdateOwnersCommand : IAzureSearchCommand
public UpdateOwnersCommand(
IDatabaseAuxiliaryDataFetcher databaseFetcher,
IOwnerDataClient ownerDataClient,
IOwnerSetComparer ownerSetComparer,
IDataSetComparer ownerSetComparer,
ISearchDocumentBuilder searchDocumentBuilder,
ISearchIndexActionBuilder indexActionBuilder,
Func<IBatchPusher> batchPusherFactory,
Expand Down Expand Up @@ -67,7 +67,7 @@ public async Task ExecuteAsync()
var databaseResult = await _databaseFetcher.GetPackageIdToOwnersAsync();

_logger.LogInformation("Detecting owner changes.");
var changes = _ownerSetComparer.Compare(storageResult.Result, databaseResult);
var changes = _ownerSetComparer.CompareOwners(storageResult.Result, databaseResult);
var changesBag = new ConcurrentBag<IdAndValue<string[]>>(changes.Select(x => new IdAndValue<string[]>(x.Key, x.Value)));
_logger.LogInformation("{Count} package IDs have owner changes.", changesBag.Count);

Expand Down
13 changes: 13 additions & 0 deletions src/NuGet.Services.AzureSearch/AzureSearchTelemetryService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,19 @@ public void TrackReadLatestIndexedPopularityTransfers(int outgoingTransfers, Tim
});
}

/// <summary>
/// Emits the "PopularityTransfersSetComparisonSeconds" metric for one popularity transfers comparison.
/// </summary>
/// <param name="oldCount">Reported as the "OldCount" property.</param>
/// <param name="newCount">Reported as the "NewCount" property.</param>
/// <param name="changeCount">Reported as the "ChangeCount" property.</param>
/// <param name="elapsed">Duration of the comparison; its total seconds is the metric value.</param>
public void TrackPopularityTransfersSetComparison(int oldCount, int newCount, int changeCount, TimeSpan elapsed)
{
    _telemetryClient.TrackMetric(
        Prefix + "PopularityTransfersSetComparisonSeconds",
        elapsed.TotalSeconds,
        new Dictionary<string, string>
        {
            // Each property reports its own argument; all three previously reported oldCount.
            { "OldCount", oldCount.ToString() },
            { "NewCount", newCount.ToString() },
            { "ChangeCount", changeCount.ToString() },
        });
}

public IDisposable TrackReplaceLatestIndexedPopularityTransfers(int outogingTransfers)
{
return _telemetryClient.TrackDuration(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,14 +249,14 @@ public static IServiceCollection AddAzureSearch(
services.AddTransient<ICatalogLeafFetcher, CatalogLeafFetcher>();
services.AddTransient<ICommitCollectorLogic, AzureSearchCollectorLogic>();
services.AddTransient<IDatabaseAuxiliaryDataFetcher, DatabaseAuxiliaryDataFetcher>();
services.AddTransient<IDataSetComparer, DataSetComparer>();
services.AddTransient<IDocumentFixUpEvaluator, DocumentFixUpEvaluator>();
services.AddTransient<IDownloadSetComparer, DownloadSetComparer>();
services.AddTransient<IEntitiesContextFactory, EntitiesContextFactory>();
services.AddTransient<IHijackDocumentBuilder, HijackDocumentBuilder>();
services.AddTransient<IIndexBuilder, IndexBuilder>();
services.AddTransient<IIndexOperationBuilder, IndexOperationBuilder>();
services.AddTransient<INewPackageRegistrationProducer, NewPackageRegistrationProducer>();
services.AddTransient<IOwnerSetComparer, OwnerSetComparer>();
services.AddTransient<IPackageEntityIndexActionBuilder, PackageEntityIndexActionBuilder>();
services.AddTransient<ISearchDocumentBuilder, SearchDocumentBuilder>();
services.AddTransient<ISearchIndexActionBuilder, SearchIndexActionBuilder>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public interface IAzureSearchTelemetryService
void TrackOwnerSetComparison(int oldCount, int newCount, int changeCount, TimeSpan elapsed);
void TrackReadLatestIndexedOwners(int packageIdCount, TimeSpan elapsed);
void TrackReadLatestOwnersFromDatabase(int packageIdCount, TimeSpan elapsed);
void TrackPopularityTransfersSetComparison(int oldCount, int newCount, int changeCount, TimeSpan elapsed);
void TrackReadLatestIndexedPopularityTransfers(int outgoingTransfers, TimeSpan elapsed);
void TrackReadLatestVerifiedPackagesFromDatabase(int packageIdCount, TimeSpan elapsed);
IDisposable TrackReplaceLatestIndexedOwners(int packageIdCount);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@
<Compile Include="JobOutcome.cs" />
<Compile Include="Models\IUpdatedDocument.cs" />
<Compile Include="Models\UpdatedDocument.cs" />
<Compile Include="Auxiliary2AzureSearch\IOwnerSetComparer.cs" />
<Compile Include="PackageIdToPopularityTransfersBuilder.cs" />
<Compile Include="Auxiliary2AzureSearch\IDataSetComparer.cs" />
<Compile Include="SearchIndexActionBuilder.cs" />
<Compile Include="Auxiliary2AzureSearch\OwnerSetComparer.cs" />
<Compile Include="Auxiliary2AzureSearch\DataSetComparer.cs" />
<Compile Include="PackageIdToOwnersBuilder.cs" />
<Compile Include="AuxiliaryFiles\IOwnerDataClient.cs" />
<Compile Include="AuxiliaryFiles\OwnerDataClient.cs" />
Expand Down
Loading

0 comments on commit 981ec3c

Please sign in to comment.