This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GitHub Indexer] Search for popular GitHub repositories (#770)
* [GH Index] Initial commit * [GH Index] Fixed build * Added License headers * Changed Nuspec Id * Changed Nuspec script include * Added empty job * [GH Idx] Added Octokit and LibGit2Sharp dependencies * [GH Idx] Add initial GHSearcher * [GH Idx] Add GitRepoSearcher * [GH Idx] Add dependency injection * [GH Idx] Add null check * [GH Idx] Add tests * [GH Idx] Extracted constants * [GH Idx] Fixed tests * Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs Co-Authored-By: Loïc Sharma <[email protected]> * Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs Co-Authored-By: Loïc Sharma <[email protected]> * [GH Idx] Removed duplicate class RepositoryInformation * [GH Idx] Refactored the code a bit * [GH Idx] Fix possible deadlock * [GH Idx] Add config section in the appsettings.json * [GH Idx] GitHubSearcher is not recursive anymore! * [GH Idx] Removed redundant comparer * [GH Idx] Fix upperStarBound wrongly set on request * [GH Idx] Fixed sleep time * [GH Idx] Fix typo * [GH Idx] Made fields private * [GH Idx] Changed UA * [GH Idx] Made the configuration not static * [GH Idx] Add ApiInfo doc in the tests * [GH Idx] Refactor GH Search API requester * [GH Idx] Removed redundant import in csproj * [GH Idx] Add documentation to the configuration * [GH Idx] Move the IGitHubClient to the GitHubSearchWrapper * [GH Idx] Remove redundant variable * [GH Idx] Trim tests Assembly info * [GH Idx] Add checks to ensure the required info is in the GitHub response * [GH Idx] Moved public method before private methods * [GH Idx] Extract retry time in a static variable * [GH Idx] Add typecheck and fix tests * [GH Idx] Remove redundant using * [GH Idx] Nit space formatting * [GH Idx] Change UserAgent to use assembly name and version * [GH Idx] Remove extra line * [GH Idx] Fix nit picks
- Loading branch information
Showing
15 changed files
with
544 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
23 changes: 23 additions & 0 deletions
23
src/NuGet.Jobs.GitHubIndexer/GitHubSearcherConfiguration.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary>
/// Tunable settings for the GitHub repository search. Every property has a default,
/// so an unconfigured instance is usable as-is.
/// </summary>
public class GitHubSearcherConfiguration
{
    private const int DefaultMinStars = 100;
    private const int DefaultResultsPerPage = 100;
    private const int DefaultMaxGitHubResultsPerQuery = 1000;

    /// <summary>
    /// Lowest star count a GitHub repository must have to be included in the indexing. Defaults to 100.
    /// </summary>
    public int MinStars { get; set; } = DefaultMinStars;

    /// <summary>
    /// Page size requested from the search. Defaults to 100, which is currently the
    /// maximum allowed (limit verified on 6/24/2019).
    /// </summary>
    public int ResultsPerPage { get; set; } = DefaultResultsPerPage;

    /// <summary>
    /// Total number of results a single search query can expose across all of its pages.
    /// Defaults to 1000, which is currently the maximum (limit verified on 6/24/2019).
    /// </summary>
    public int MaxGitHubResultsPerQuery { get; set; } = DefaultMaxGitHubResultsPerQuery;
}
} |
23 changes: 23 additions & 0 deletions
23
src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/GitHubSearchApiResponse.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using NuGetGallery; | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary>
/// Read-only holder for the outcome of one repository search request: the repositories
/// that were found and the two timestamps supplied with them.
/// </summary>
public class GitHubSearchApiResponse
{
    /// <summary>
    /// Creates a response from already-parsed values.
    /// </summary>
    /// <param name="result">Repositories returned by the request. Must not be null.</param>
    /// <param name="date">Time of the response, stored in <see cref="Date"/>.</param>
    /// <param name="throttleResetTime">Time at which throttling resets, stored in <see cref="ThrottleResetTime"/>.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="result"/> is null.</exception>
    public GitHubSearchApiResponse(IReadOnlyList<RepositoryInformation> result, DateTimeOffset date, DateTimeOffset throttleResetTime)
    {
        if (result == null)
        {
            throw new ArgumentNullException(nameof(result));
        }

        Result = result;
        Date = date;
        ThrottleResetTime = throttleResetTime;
    }

    /// <summary>Repositories returned by the request. Never null.</summary>
    public IReadOnlyList<RepositoryInformation> Result { get; }

    /// <summary>Time of the response, as passed to the constructor.</summary>
    public DateTimeOffset Date { get; }

    /// <summary>Time at which throttling resets, as passed to the constructor.</summary>
    public DateTimeOffset ThrottleResetTime { get; }
}
} |
55 changes: 55 additions & 0 deletions
55
src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/GitHubSearchWrapper.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System; | ||
using System.Globalization; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
using NuGetGallery; | ||
using Octokit; | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary>
/// Adapter over an Octokit <see cref="IGitHubClient"/> that issues repository search requests
/// and converts the raw response (body plus throttling headers) into a
/// <see cref="GitHubSearchApiResponse"/>.
/// </summary>
public class GitHubSearchWrapper : IGitHubSearchWrapper
{
    // Layout expected for the HTTP "Date" response header, e.g. "Mon, 24 Jun 2019 17:05:42 GMT".
    private const string HttpDateFormat = "ddd',' dd MMM yyyy HH:mm:ss 'GMT'";

    private readonly IGitHubClient _client;

    /// <summary>
    /// Creates the wrapper around the given client.
    /// </summary>
    /// <param name="client">Client used for every request. Must not be null.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="client"/> is null.</exception>
    public GitHubSearchWrapper(IGitHubClient client)
    {
        _client = client ?? throw new ArgumentNullException(nameof(client));
    }

    /// <summary>
    /// Returns the remaining request count reported by the client's last API info.
    /// </summary>
    /// <returns>
    /// The remaining number of requests, or null when the client has no API info
    /// (or no rate-limit data) available.
    /// </returns>
    public int? GetRemainingRequestCount()
    {
        // Both the API info and its rate-limit section are guarded so that missing data yields null
        // instead of a NullReferenceException.
        return _client.GetLastApiInfo()?.RateLimit?.Remaining;
    }

    /// <summary>
    /// Sends the search request and parses the response, including the "Date" and
    /// "X-RateLimit-Reset" headers that the caller needs to compute the throttling delay.
    /// </summary>
    /// <param name="request">Search request to send. Must not be null.</param>
    /// <returns>The repositories found together with the response time and the throttle reset time (both as local time).</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidDataException">Thrown when a required header is missing or cannot be parsed.</exception>
    public async Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request)
    {
        if (request == null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        var apiResponse = await _client.Connection.Get<SearchRepositoryResult>(ApiUrls.SearchRepositories(), request.Parameters, null);
        var headers = apiResponse.HttpResponse.Headers;

        // The header carries a GMT timestamp without an offset element, so it is parsed explicitly as UTC
        // rather than going through an unspecified-kind DateTime.
        if (!headers.TryGetValue("Date", out var ghStrDate)
            || !DateTimeOffset.TryParseExact(ghStrDate, HttpDateFormat, CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var ghTime))
        {
            throw new InvalidDataException("Date is required to compute the throttling time.");
        }

        // Header values are machine-formatted: parse with the invariant culture so the result
        // does not depend on the culture of the thread running the job.
        if (!headers.TryGetValue("X-RateLimit-Reset", out var ghStrResetLimit)
            || !long.TryParse(ghStrResetLimit, NumberStyles.Integer, CultureInfo.InvariantCulture, out var ghResetTime))
        {
            throw new InvalidDataException("X-RateLimit-Reset is required to compute the throttling time.");
        }

        var repositories = apiResponse.Body.Items
            .Select(repo => new RepositoryInformation(
                $"{repo.Owner.Login}/{repo.Name}",
                repo.HtmlUrl,
                repo.StargazersCount,
                Array.Empty<string>()))
            .ToList();

        return new GitHubSearchApiResponse(
            repositories,
            ghTime.ToLocalTime(),
            DateTimeOffset.FromUnixTimeSeconds(ghResetTime).ToLocalTime());
    }
}
} |
162 changes: 162 additions & 0 deletions
162
src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/GitHubSearcher.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
using Microsoft.Extensions.Logging; | ||
using Microsoft.Extensions.Options; | ||
using NuGetGallery; | ||
using Octokit; | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary>
/// Collects popular C# repositories through an <see cref="IGitHubSearchWrapper"/>.
/// A single query only exposes a limited number of results, so the search is repeated with a
/// decreasing upper star bound until the configured minimum star count is reached or no more
/// results come back.
/// </summary>
public class GitHubSearcher : IGitRepoSearcher
{
    private static readonly TimeSpan LimitExceededRetryTime = TimeSpan.FromSeconds(5);

    private readonly ILogger<GitHubSearcher> _logger;
    private readonly IOptionsSnapshot<GitHubSearcherConfiguration> _configuration;
    private readonly IGitHubSearchWrapper _searchApiRequester;

    // Point in time at which the current throttling window is expected to end.
    private DateTimeOffset _throttleResetTime;

    /// <summary>
    /// Creates the searcher.
    /// </summary>
    /// <param name="searchApiRequester">Wrapper used to send search requests. Must not be null.</param>
    /// <param name="logger">Logger for progress and warnings. Must not be null.</param>
    /// <param name="configuration">Search settings. Must not be null.</param>
    /// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
    public GitHubSearcher(
        IGitHubSearchWrapper searchApiRequester,
        ILogger<GitHubSearcher> logger,
        IOptionsSnapshot<GitHubSearcherConfiguration> configuration)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
        _searchApiRequester = searchApiRequester ?? throw new ArgumentNullException(nameof(searchApiRequester));
    }

    private int MinStars => _configuration.Value.MinStars;
    private int ResultsPerPage => _configuration.Value.ResultsPerPage;
    private int MaxGitHubResultsPerQuery => _configuration.Value.MaxGitHubResultsPerQuery;

    /// <summary>
    /// Searches GitHub for all the C# repos that have at least the configured minimum number of stars,
    /// removes duplicates and returns them ordered by star count, highest first.
    /// </summary>
    /// <returns>List of C# repos on GitHub that satisfy the configured minimum star count.</returns>
    public async Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories()
    {
        _logger.LogInformation("Starting search on GitHub...");
        var result = await GetResultsFromGitHub();
        return result
            .GroupBy(x => x.Id) // Used to remove duplicate repos (since the GH Search API may return a result that we already had in memory)
            .Select(g => g.First())
            .OrderByDescending(x => x.Stars)
            .ToList();
    }

    /// <summary>
    /// When the wrapper reports that no requests remain, waits until the remembered throttle reset time.
    /// </summary>
    private async Task CheckThrottle()
    {
        if (_searchApiRequester.GetRemainingRequestCount() == 0)
        {
            var sleepTime = _throttleResetTime - DateTimeOffset.Now;

            // Mark the window as consumed so that the next response recomputes the reset time.
            _throttleResetTime = DateTimeOffset.Now;
            if (sleepTime.TotalSeconds > 0)
            {
                _logger.LogInformation("Waiting {TotalSeconds} seconds to cooldown.", sleepTime.TotalSeconds);
                await Task.Delay(sleepTime);
            }

            _logger.LogInformation("Resuming search.");
        }
    }

    /// <summary>
    /// Sends one search request (retrying while the rate limit is exceeded) and refreshes the
    /// throttle reset time when the previous one has elapsed.
    /// </summary>
    /// <param name="request">Request to send.</param>
    /// <returns>Repositories contained in the response.</returns>
    private async Task<IReadOnlyList<RepositoryInformation>> SearchRepo(SearchRepositoriesRequest request)
    {
        _logger.LogInformation("Requesting page {Page} for stars {Stars}", request.Page, request.Stars);

        var response = await GetResponseWithRetry(request);

        if (_throttleResetTime < DateTimeOffset.Now)
        {
            // Both timestamps come from the same response, so their difference is a duration
            // that can be applied to the local clock.
            var timeToWait = response.ThrottleResetTime - response.Date;
            _throttleResetTime = DateTimeOffset.Now + timeToWait;
        }

        return response.Result;
    }

    /// <summary>
    /// Calls the wrapper until it answers without a <see cref="RateLimitExceededException"/>,
    /// pausing <see cref="LimitExceededRetryTime"/> between attempts. Other exceptions propagate.
    /// </summary>
    private async Task<GitHubSearchApiResponse> GetResponseWithRetry(SearchRepositoriesRequest request)
    {
        while (true)
        {
            try
            {
                return await _searchApiRequester.GetResponse(request);
            }
            catch (RateLimitExceededException)
            {
                _logger.LogError("Exceeded GitHub RateLimit! Waiting for {LimitExceededRetryTime} before retrying.", LimitExceededRetryTime);
                await Task.Delay(LimitExceededRetryTime);
            }
        }
    }

    /// <summary>
    /// Pages through the search results, lowering the upper star bound after each full query,
    /// and accumulates everything that was returned (duplicates included).
    /// </summary>
    /// <returns>All repositories returned by the search, in the order they were received.</returns>
    /// <exception cref="InvalidOperationException">Thrown when the paging configuration is not positive.</exception>
    private async Task<List<RepositoryInformation>> GetResultsFromGitHub()
    {
        var minStars = MinStars;
        var resultsPerPage = ResultsPerPage;
        var maxResultsPerQuery = MaxGitHubResultsPerQuery;

        // A non-positive page size would make the page count infinite, and a non-positive result limit
        // would leave nothing to page through; fail with a clear message instead.
        if (resultsPerPage <= 0 || maxResultsPerQuery <= 0)
        {
            throw new InvalidOperationException(
                "ResultsPerPage and MaxGitHubResultsPerQuery must be greater than zero. " + GetConfigInfo());
        }

        _throttleResetTime = DateTimeOffset.Now;
        var upperStarBound = int.MaxValue;
        var resultList = new List<RepositoryInformation>();
        var lastPage = (int)Math.Ceiling(maxResultsPerQuery / (double)resultsPerPage);

        while (upperStarBound >= minStars)
        {
            for (var page = 1; page <= lastPage; page++)
            {
                await CheckThrottle();

                var request = new SearchRepositoriesRequest
                {
                    Stars = new Range(minStars, upperStarBound),
                    Language = Language.CSharp,
                    SortField = RepoSearchSort.Stars,
                    Order = SortDirection.Descending,
                    PerPage = resultsPerPage,
                    Page = page
                };

                var response = await SearchRepo(request);

                if (response == null || response.Count == 0)
                {
                    _logger.LogWarning("Search request didn't return any item. Page: {Page} {ConfigInfo}", request.Page, GetConfigInfo());
                    return resultList;
                }

                // TODO: Block unwanted repos (https://github.com/NuGet/NuGetGallery/issues/7298)
                resultList.AddRange(response);

                var firstStars = response[0].Stars;
                var lastStars = response[response.Count - 1].Stars;
                if (page == lastPage && firstStars == lastStars)
                {
                    // GitHub throttles us after a certain number of results per query.
                    // We can only construct queries based on number of stars a repository has.
                    // As a result, if too many repositories have the same number of stars,
                    // we will lose data because we can't create another query that filters out the results that we have already seen with the same number of stars.
                    _logger.LogWarning("Last page results have the same star count! This may result in missing data. StarCount: {Stars} {ConfigInfo}",
                        firstStars,
                        GetConfigInfo());

                    return resultList;
                }
            }

            var nextUpperStarBound = resultList[resultList.Count - 1].Stars;
            if (nextUpperStarBound >= upperStarBound)
            {
                // The next query would be identical to the one that just completed, so continuing
                // would repeat the same requests forever. Stop with what has been collected.
                _logger.LogWarning("Star upper bound did not decrease. Stopping the search to avoid repeating the same query. StarCount: {Stars} {ConfigInfo}",
                    nextUpperStarBound,
                    GetConfigInfo());

                return resultList;
            }

            upperStarBound = nextUpperStarBound;
        }

        return resultList;
    }

    /// <summary>
    /// Formats the current search settings for inclusion in log messages.
    /// </summary>
    private string GetConfigInfo()
    {
        return $"MinStars: {MinStars}\n" +
               $"ResultsPerPage: {ResultsPerPage}\n" +
               $"MaxGithubResultPerQuery: {MaxGitHubResultsPerQuery}\n";
    }
}
} |
24 changes: 24 additions & 0 deletions
24
src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/IGitHubSearchWrapper.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System.Threading.Tasks; | ||
using Octokit; | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary> | ||
/// Abstraction over the GitHub Repo Search Api: sends search requests and exposes the remaining request count | ||
/// </summary> | ||
public interface IGitHubSearchWrapper | ||
{ | ||
/// <summary> | ||
/// Queries the GitHub Repo Search Api and returns its response | ||
/// </summary> | ||
/// <param name="request">Request to be made to the GitHub Repo Search Api</param> | ||
/// <returns>Parsed response of the GitHub Repo Search Api</returns> | ||
Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request); | ||
|
||
/// <summary> | ||
/// Returns the number of remaining requests before the search gets throttled | ||
/// </summary> | ||
/// <returns>Returns the number of remaining requests or null if no info is available (no request has been done yet)</returns> | ||
int? GetRemainingRequestCount(); | ||
} | ||
} |
18 changes: 18 additions & 0 deletions
18
src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/IGitRepoSearcher.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// Copyright (c) .NET Foundation. All rights reserved. | ||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. | ||
|
||
using System.Collections.Generic; | ||
using System.Threading.Tasks; | ||
using NuGetGallery; | ||
|
||
namespace NuGet.Jobs.GitHubIndexer | ||
{ | ||
/// <summary> | ||
/// Source of popular C# repositories for the indexer | ||
/// </summary> | ||
public interface IGitRepoSearcher | ||
{ | ||
/// <summary> | ||
/// Searches for all popular C# repos, sorts them in descending order and returns a list containing their basic information | ||
/// </summary> | ||
/// <returns>List of popular C# repositories</returns> | ||
Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories(); | ||
} | ||
} |
Oops, something went wrong.