Skip to content
This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
/ NuGet.Jobs Public archive

[GitHub Indexer] Search for popular GitHub repositories #770

Merged
merged 45 commits into from
Jun 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ba497cb
[GH Index] Initial commit
mogah Jun 13, 2019
d31c59e
[GH Index] Fixed build
mogah Jun 13, 2019
ae29856
Added License headers
mogah Jun 14, 2019
388b9d7
Changed Nuspec Id
mogah Jun 14, 2019
d1e407f
Changed Nuspec script include
mogah Jun 14, 2019
d0fcfe1
Added empty job
mogah Jun 14, 2019
6de40f3
[GH Idx] Added Octokit and LibGit2Sharp dependencies
mogah Jun 14, 2019
2d904d3
[GH Idx] Add initial GHSearcher
mogah Jun 14, 2019
ca96d25
[GH Idx] Add GitRepoSearcher
mogah Jun 14, 2019
2d346b5
[GH Idx] Add dependency injection
mogah Jun 15, 2019
eab0c9c
[GH Idx] Add null check
mogah Jun 17, 2019
d9a8eb2
[GH Idx] Add tests
mogah Jun 17, 2019
1a5ccba
[GH Idx] Extracted constants
mogah Jun 18, 2019
515bdd9
[GH Idx] Fixed tests
mogah Jun 18, 2019
a6d0256
Merge remote-tracking branch 'origin/dev' into mogah-github-indexer
mogah Jun 18, 2019
8b5e2ca
Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs
mogah Jun 18, 2019
e3fc3a4
Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs
mogah Jun 18, 2019
6147012
[GH Idx] Removed duplicate class RepositoryInformation
mogah Jun 18, 2019
736df94
[GH Idx] Refactored the code a bit
mogah Jun 18, 2019
228e82e
[GH Idx] Fix possible deadlock
mogah Jun 18, 2019
ae4d8d2
[GH Idx] Add config section in the appsettings.json
mogah Jun 19, 2019
e4ef982
[GH Idx] GitHubSearcher is not recursive anymore!
mogah Jun 19, 2019
a03ddfc
[GH Idx] Removed redundant comparer
mogah Jun 19, 2019
a82c009
[GH Idx] Fix upperStarBound wrongly set on request
mogah Jun 19, 2019
d44803e
[GH Idx] Fixed sleep time
mogah Jun 20, 2019
f23f800
[GH Idx] Fix typo
mogah Jun 21, 2019
d34a3f4
[GH Idx] Made fields private
mogah Jun 21, 2019
c1c35df
[GH Idx] Changed UA
mogah Jun 21, 2019
8a91ad7
[GH Idx] Made the configuration not static
mogah Jun 21, 2019
b5c38ba
[GH Idx] Add ApiInfo doc in the tests
mogah Jun 21, 2019
677a900
[GH Idx] Refactor GH Search API requester
mogah Jun 24, 2019
422f9c6
[GH Idx] Removed redundant import in csproj
mogah Jun 24, 2019
6096aa1
[GH Idx] Add documentation to the configuration
mogah Jun 24, 2019
6d2731d
[GH Idx] Move the IGitHubClient to the GitHubSearchWrapper
mogah Jun 24, 2019
b904dff
[GH Idx] Remove redundant variable
mogah Jun 24, 2019
a411667
[GH Idx] Trim tests Assembly info
mogah Jun 24, 2019
4648816
[GH Idx] Add checks to ensure the required info is in the GitHub resp…
mogah Jun 24, 2019
c9fbb29
[GH Idx] Moved public method before private methods
mogah Jun 24, 2019
c2e771d
[GH Idx] Extract retry time in a static variable
mogah Jun 24, 2019
bbc09fb
[GH Idx] Add typecheck and fix tests
mogah Jun 24, 2019
73c268a
[GH Idx] Remove redundant using
mogah Jun 24, 2019
63aca24
[GH Idx] Nit space formatting
mogah Jun 24, 2019
07a2e45
[GH Idx] Change UserAgent to use assembly name and version
mogah Jun 24, 2019
8782195
[GH Idx] Remove extra line
mogah Jun 25, 2019
a25c3a2
[GH Idx] Fix nit picks
mogah Jun 25, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions NuGet.Jobs.sln
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.28902.138
# Visual Studio 15
VisualStudioVersion = 15.0.28307.645
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.Common", "src\NuGet.Jobs.Common\NuGet.Jobs.Common.csproj", "{4B4B1EFB-8F33-42E6-B79F-54E7F3293D31}"
EndProject
Expand Down Expand Up @@ -151,6 +151,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.GitHubIndexer",
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TestUtil", "tests\TestUtil\TestUtil.csproj", "{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.GitHubIndexer.Tests", "tests\NuGet.Jobs.GitHubIndexer.Tests\NuGet.Jobs.GitHubIndexer.Tests.csproj", "{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -399,6 +401,10 @@ Global
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Release|Any CPU.Build.0 = Release|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -464,6 +470,7 @@ Global
{D3F1711A-25AC-4EC9-9971-4F838BCD2A07} = {6A776396-02B1-475D-A104-26940ADB04AB}
{42B1EB66-58F9-4D9A-8E23-FF12CBF5D643} = {FA5644B5-4F08-43F6-86B3-039374312A47}
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62} = {6A776396-02B1-475D-A104-26940ADB04AB}
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7} = {6A776396-02B1-475D-A104-26940ADB04AB}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {284A7AC3-FB43-4F1F-9C9C-2AF0E1F46C2B}
Expand Down
23 changes: 23 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitHubSearcherConfiguration.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Settings that drive the GitHub repository search: the star threshold and the paging limits.
/// Every property has a default value, so an instance is usable even when nothing is bound to it.
/// </summary>
public class GitHubSearcherConfiguration
{
    private const int DefaultMinStars = 100;
    private const int DefaultResultsPerPage = 100;
    private const int DefaultMaxGitHubResultsPerQuery = 1000;

    /// <summary>
    /// Minimum number of stars that a GitHub Repo needs to have to be included in the indexing. Defaults to 100.
    /// </summary>
    public int MinStars { get; set; } = DefaultMinStars;

    /// <summary>
    /// The number of results that would be shown per page. Defaults to 100.
    /// This is currently limited to 100 by GitHub (limit verified on 6/24/2019).
    /// </summary>
    public int ResultsPerPage { get; set; } = DefaultResultsPerPage;

    /// <summary>
    /// The limit of results that a single search query can show. Defaults to 1000.
    /// This is currently limited to 1000 by GitHub (limit verified on 6/24/2019).
    /// </summary>
    public int MaxGitHubResultsPerQuery { get; set; } = DefaultMaxGitHubResultsPerQuery;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using NuGetGallery;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Immutable result of one GitHub repository search request: the repositories that were returned
/// plus the two timestamps needed to work out how long the rate-limit window still lasts.
/// </summary>
public class GitHubSearchApiResponse
{
    /// <summary>
    /// Creates a response.
    /// </summary>
    /// <param name="result">Repositories returned by the request. Must not be null.</param>
    /// <param name="date">Time reported for the response.</param>
    /// <param name="throttleResetTime">Time at which the rate limit is reset.</param>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
    public GitHubSearchApiResponse(IReadOnlyList<RepositoryInformation> result, DateTimeOffset date, DateTimeOffset throttleResetTime)
    {
        Result = result ?? throw new ArgumentNullException(nameof(result));
        Date = date;
        ThrottleResetTime = throttleResetTime;
    }

    /// <summary>
    /// Repositories returned by the request. Never null.
    /// </summary>
    public IReadOnlyList<RepositoryInformation> Result { get; }

    /// <summary>
    /// Time reported for the response.
    /// </summary>
    public DateTimeOffset Date { get; }

    /// <summary>
    /// Time at which the rate limit is reset.
    /// </summary>
    public DateTimeOffset ThrottleResetTime { get; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using NuGetGallery;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// <see cref="IGitHubSearchWrapper"/> implementation backed by an Octokit <see cref="IGitHubClient"/>.
/// It issues the repository search request and extracts, from the response headers, the timestamps
/// needed to compute the throttling time.
/// </summary>
public class GitHubSearchWrapper : IGitHubSearchWrapper
{
    private const string DateHeaderName = "Date";
    private const string RateLimitResetHeaderName = "X-RateLimit-Reset";
    private const string DateHeaderFormat = "ddd',' dd MMM yyyy HH:mm:ss 'GMT'";

    private readonly IGitHubClient _client;

    /// <summary>
    /// Creates the wrapper.
    /// </summary>
    /// <param name="client">Client used to reach the GitHub API. Must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public GitHubSearchWrapper(IGitHubClient client)
    {
        _client = client ?? throw new ArgumentNullException(nameof(client));
    }

    /// <summary>
    /// Returns the remaining request count taken from the client's last API info.
    /// </summary>
    /// <returns>The remaining request count, or null when the client has no API or rate-limit info.</returns>
    public int? GetRemainingRequestCount()
    {
        var apiInfo = _client.GetLastApiInfo();
        return apiInfo?.RateLimit?.Remaining;
    }

    /// <summary>
    /// Sends the repository search request and converts the answer into a <see cref="GitHubSearchApiResponse"/>.
    /// </summary>
    /// <param name="request">Search request to send. Must not be null.</param>
    /// <returns>The returned repositories together with the response date and the rate-limit reset time, both in local time.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidDataException">The "Date" or "X-RateLimit-Reset" header is missing or cannot be parsed.</exception>
    public async Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request)
    {
        if (request == null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        var apiResponse = await _client.Connection.Get<SearchRepositoryResult>(ApiUrls.SearchRepositories(), request.Parameters, null);
        var headers = apiResponse.HttpResponse.Headers;

        // The header value ends with a literal "GMT", so it is parsed explicitly as a universal time
        // instead of relying on how an unspecified-kind DateTime gets converted later on.
        if (!headers.TryGetValue(DateHeaderName, out var ghStrDate)
            || !DateTimeOffset.TryParseExact(
                ghStrDate,
                DateHeaderFormat,
                CultureInfo.InvariantCulture,
                DateTimeStyles.AssumeUniversal,
                out var ghTime))
        {
            throw new InvalidDataException("Date is required to compute the throttling time.");
        }

        // The value is machine-generated (it is fed to FromUnixTimeSeconds below), so it is parsed
        // with the invariant culture rather than the culture of the current thread.
        if (!headers.TryGetValue(RateLimitResetHeaderName, out var ghStrResetLimit)
            || !long.TryParse(ghStrResetLimit, NumberStyles.Integer, CultureInfo.InvariantCulture, out var ghResetTime))
        {
            throw new InvalidDataException("X-RateLimit-Reset is required to compute the throttling time.");
        }

        var repositories = apiResponse.Body.Items
            .Select(repo => new RepositoryInformation(
                $"{repo.Owner.Login}/{repo.Name}",
                repo.HtmlUrl,
                repo.StargazersCount,
                Array.Empty<string>()))
            .ToList();

        return new GitHubSearchApiResponse(
            repositories,
            ghTime.ToLocalTime(),
            DateTimeOffset.FromUnixTimeSeconds(ghResetTime).ToLocalTime());
    }
}
}
162 changes: 162 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/GitHubSearcher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using NuGetGallery;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Collects popular C# repositories through an <see cref="IGitHubSearchWrapper"/>.
/// Results are requested page by page, sorted by descending star count. Once the last page of a query
/// has been read, a new query is started whose upper star bound is the star count of the last
/// repository seen, until the bound drops below the configured minimum.
/// </summary>
public class GitHubSearcher : IGitRepoSearcher
{
    private static readonly TimeSpan LimitExceededRetryTime = TimeSpan.FromSeconds(5);

    private readonly ILogger<GitHubSearcher> _logger;
    private readonly IOptionsSnapshot<GitHubSearcherConfiguration> _configuration;
    private readonly IGitHubSearchWrapper _searchApiRequester;

    // Local estimate of when the current rate-limit window ends; refreshed from the response timestamps.
    private DateTimeOffset _throttleResetTime;

    /// <summary>
    /// Creates the searcher.
    /// </summary>
    /// <param name="searchApiRequester">Wrapper used to send the search requests. Must not be null.</param>
    /// <param name="logger">Logger. Must not be null.</param>
    /// <param name="configuration">Search configuration. Must not be null.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public GitHubSearcher(
        IGitHubSearchWrapper searchApiRequester,
        ILogger<GitHubSearcher> logger,
        IOptionsSnapshot<GitHubSearcherConfiguration> configuration)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
        _searchApiRequester = searchApiRequester ?? throw new ArgumentNullException(nameof(searchApiRequester));
    }

    private int MinStars => _configuration.Value.MinStars;
    private int ResultsPerPage => _configuration.Value.ResultsPerPage;
    private int MaxGitHubResultsPerQuery => _configuration.Value.MaxGitHubResultsPerQuery;

    /// <summary>
    /// Searches GitHub for the C# repos that have at least <see cref="GitHubSearcherConfiguration.MinStars"/> stars
    /// (100 by default), removes duplicates and returns them ordered by descending star count.
    /// </summary>
    /// <returns>List of C# repos on GitHub that reach the configured star threshold, most starred first</returns>
    public async Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories()
    {
        _logger.LogInformation("Starting search on GitHub...");
        var result = await GetResultsFromGitHub();
        return result
            .GroupBy(x => x.Id) // Used to remove duplicate repos (since the GH Search API may return a result that we already had in memory)
            .Select(g => g.First())
            .OrderByDescending(x => x.Stars)
            .ToList();
    }

    /// <summary>
    /// Waits for the end of the rate-limit window when the wrapper reports that no request is left.
    /// </summary>
    private async Task CheckThrottle()
    {
        if (_searchApiRequester.GetRemainingRequestCount() != 0)
        {
            // Requests are left, or no information is available yet (null).
            return;
        }

        var sleepTime = _throttleResetTime - DateTimeOffset.Now;

        // Marks the estimate as consumed so that the next response computes a fresh one.
        _throttleResetTime = DateTimeOffset.Now;
        if (sleepTime > TimeSpan.Zero)
        {
            _logger.LogInformation("Waiting {TotalSeconds} seconds to cooldown.", sleepTime.TotalSeconds);
            await Task.Delay(sleepTime);
        }

        _logger.LogInformation("Resuming search.");
    }

    /// <summary>
    /// Sends one search request and refreshes the rate-limit estimate from the response.
    /// </summary>
    /// <param name="request">Request to send.</param>
    /// <returns>The repositories contained in the response.</returns>
    private async Task<IReadOnlyList<RepositoryInformation>> SearchRepo(SearchRepositoriesRequest request)
    {
        _logger.LogInformation("Requesting page {Page} for stars {Stars}", request.Page, request.Stars);

        var response = await GetResponseWithRetry(request);

        if (_throttleResetTime < DateTimeOffset.Now)
        {
            // Only the difference between the two response timestamps is used; it is applied to the local clock.
            var timeToWait = response.ThrottleResetTime - response.Date;
            _throttleResetTime = DateTimeOffset.Now + timeToWait;
        }

        return response.Result;
    }

    /// <summary>
    /// Sends the request, retrying after <see cref="LimitExceededRetryTime"/> for as long as the rate limit is reported as exceeded.
    /// </summary>
    private async Task<GitHubSearchApiResponse> GetResponseWithRetry(SearchRepositoriesRequest request)
    {
        while (true)
        {
            try
            {
                return await _searchApiRequester.GetResponse(request);
            }
            catch (RateLimitExceededException)
            {
                _logger.LogError("Exceeded GitHub RateLimit! Waiting for {LimitExceededRetryTime} before retrying.", LimitExceededRetryTime);
                await Task.Delay(LimitExceededRetryTime);
            }
        }
    }

    /// <summary>
    /// Runs the paged queries and accumulates every repository returned. The list may contain duplicates.
    /// </summary>
    private async Task<List<RepositoryInformation>> GetResultsFromGitHub()
    {
        _throttleResetTime = DateTimeOffset.Now;
        var upperStarBound = int.MaxValue;
        var resultList = new List<RepositoryInformation>();

        // Number of pages needed to read the maximum amount of results a single query can show.
        var lastPage = Math.Ceiling(MaxGitHubResultsPerQuery / (double)ResultsPerPage);

        while (upperStarBound >= MinStars)
        {
            var page = 0;
            while (page < lastPage)
            {
                await CheckThrottle();

                var request = new SearchRepositoriesRequest
                {
                    Stars = new Range(MinStars, upperStarBound),
                    Language = Language.CSharp,
                    SortField = RepoSearchSort.Stars,
                    Order = SortDirection.Descending,
                    PerPage = ResultsPerPage,
                    Page = page + 1
                };

                var pageResults = await SearchRepo(request);

                if (pageResults == null || pageResults.Count == 0)
                {
                    _logger.LogWarning("Search request didn't return any item. Page: {Page} {ConfigInfo}", request.Page, GetConfigInfo());
                    return resultList;
                }

                // TODO: Block unwanted repos (https://github.com/NuGet/NuGetGallery/issues/7298)
                resultList.AddRange(pageResults);
                page++;

                var firstStars = pageResults[0].Stars;
                var lastStars = pageResults[pageResults.Count - 1].Stars;
                if (page == lastPage && firstStars == lastStars)
                {
                    // GitHub throttles us after a certain number of results per query.
                    // We can only construct queries based on number of stars a repository has.
                    // As a result, if too many repositories have the same number of stars,
                    // we will lose data because we can't create another query that filters out the results that we have already seen with the same number of stars.
                    _logger.LogWarning("Last page results have the same star count! This may result in missing data. StarCount: {Stars} {ConfigInfo}",
                        firstStars,
                        GetConfigInfo());

                    return resultList;
                }
            }

            var nextUpperStarBound = resultList.Last().Stars;
            if (nextUpperStarBound >= upperStarBound)
            {
                // The next query would be identical to the one that just completed (this can only happen when a page
                // is not ordered by descending star count), so stop here instead of repeating it forever.
                _logger.LogWarning("The star upper bound did not decrease after a full query, stopping the search. StarCount: {Stars} {ConfigInfo}",
                    nextUpperStarBound,
                    GetConfigInfo());

                return resultList;
            }

            upperStarBound = nextUpperStarBound;
        }

        return resultList;
    }

    /// <summary>
    /// Formats the current configuration values for log messages.
    /// </summary>
    private string GetConfigInfo()
    {
        return $"MinStars: {MinStars}\n" +
            $"ResultsPerPage: {ResultsPerPage}\n" +
            $"MaxGithubResultPerQuery: {MaxGitHubResultsPerQuery}\n";
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Threading.Tasks;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Abstraction over the GitHub Repo Search Api: sends a search request and exposes the remaining request count.
/// </summary>
public interface IGitHubSearchWrapper
{
/// <summary>
/// Queries the GitHub Repo Search Api and returns its response
/// </summary>
/// <param name="request">Request to be made to the GitHub Repo Search Api</param>
/// <returns>Parsed response of the GitHub Repo Search Api</returns>
Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request);

/// <summary>
/// Returns the number of remaining requests before the search gets throttled
/// </summary>
/// <returns>Returns the number of remaining requests or null if no info is available (no request has been done yet)</returns>
int? GetRemainingRequestCount();
}
}
18 changes: 18 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/IGitRepoSearcher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Generic;
using System.Threading.Tasks;
using NuGetGallery;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Source of popular C# repositories, returned as <see cref="RepositoryInformation"/> entries.
/// </summary>
public interface IGitRepoSearcher
{
/// <summary>
/// Searches for all popular C# repos, orders them in descending order and returns a list containing their basic information.
/// NOTE(review): the ordering key is not stated by this contract; the GitHubSearcher implementation orders by star count.
/// </summary>
/// <returns>List of popular C# repositories</returns>
Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories();
}
}
Loading