Skip to content
This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
/ NuGet.Jobs Public archive

Commit

Permalink
[GitHub Indexer] Search for popular GitHub repositories (#770)
Browse files Browse the repository at this point in the history
* [GH Index] Initial commit

* [GH Index] Fixed build

* Added License headers

* Changed Nuspec Id

* Changed Nuspec script include

* Added empty job

* [GH Idx] Added Octokit and LibGit2Sharp dependencies

* [GH Idx] Add initial GHSearcher

* [GH Idx] Add GitRepoSearcher

* [GH Idx] Add dependency injection

* [GH Idx] Add null check

* [GH Idx] Add tests

* [GH Idx] Extracted constants

* [GH Idx] Fixed tests

* Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs

Co-Authored-By: Loïc Sharma <[email protected]>

* Update src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHubSearcher.cs

Co-Authored-By: Loïc Sharma <[email protected]>

* [GH Idx] Removed duplicate class RepositoryInformation

* [GH Idx] Refactored the code a bit

* [GH Idx] Fix possible deadlock

* [GH Idx] Add config section in the appsettings.json

* [GH Idx] GitHubSearcher is not recursive anymore!

* [GH Idx] Removed redundant comparer

* [GH Idx] Fix upperStarBound wrongly set on request

* [GH Idx] Fixed sleep time

* [GH Idx] Fix typo

* [GH Idx] Made fields private

* [GH Idx] Changed UA

* [GH Idx] Made the configuration not static

* [GH Idx] Add ApiInfo doc in the tests

* [GH Idx] Refactor GH Search API requester

* [GH Idx] Removed redundant import in csproj

* [GH Idx] Add documentation to the configuration

* [GH Idx] Move the IGitHubClient to the GitHubSearchWrapper

* [GH Idx] Remove redundant variable

* [GH Idx] Trim tests Assembly info

* [GH Idx] Add checks to ensure the required info is in the GitHub response

* [GH Idx] Moved public method before private methods

* [GH Idx] Extract retry time in a static variable

* [GH Idx] Add typecheck and fix tests

* [GH Idx] Remove redundant using

* [GH Idx] Nit space formatting

* [GH Idx] Change UserAgent to use assembly name and version

* [GH Idx] Remove extra line

* [GH Idx] Fix nit picks
  • Loading branch information
mogah authored Jun 25, 2019
1 parent ea6f8d0 commit 5202d2e
Show file tree
Hide file tree
Showing 15 changed files with 544 additions and 17 deletions.
11 changes: 9 additions & 2 deletions NuGet.Jobs.sln
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.28902.138
# Visual Studio 15
VisualStudioVersion = 15.0.28307.645
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.Common", "src\NuGet.Jobs.Common\NuGet.Jobs.Common.csproj", "{4B4B1EFB-8F33-42E6-B79F-54E7F3293D31}"
EndProject
Expand Down Expand Up @@ -151,6 +151,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.GitHubIndexer",
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TestUtil", "tests\TestUtil\TestUtil.csproj", "{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NuGet.Jobs.GitHubIndexer.Tests", "tests\NuGet.Jobs.GitHubIndexer.Tests\NuGet.Jobs.GitHubIndexer.Tests.csproj", "{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -399,6 +401,10 @@ Global
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62}.Release|Any CPU.Build.0 = Release|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -464,6 +470,7 @@ Global
{D3F1711A-25AC-4EC9-9971-4F838BCD2A07} = {6A776396-02B1-475D-A104-26940ADB04AB}
{42B1EB66-58F9-4D9A-8E23-FF12CBF5D643} = {FA5644B5-4F08-43F6-86B3-039374312A47}
{C3F84BAD-ACFA-4AE3-8286-D12F5A5BBC62} = {6A776396-02B1-475D-A104-26940ADB04AB}
{4A64FEB4-198C-445B-835F-A5B68EFBFDA7} = {6A776396-02B1-475D-A104-26940ADB04AB}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {284A7AC3-FB43-4F1F-9C9C-2AF0E1F46C2B}
Expand Down
23 changes: 23 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitHubSearcherConfiguration.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Settings that drive the GitHub repository search: the star threshold and the paging limits.
/// </summary>
public class GitHubSearcherConfiguration
{
    // Values used when a setting is not overridden.
    private const int DefaultMinStars = 100;
    private const int DefaultResultsPerPage = 100;
    private const int DefaultMaxGitHubResultsPerQuery = 1000;

    /// <summary>
    /// Lowest star count a GitHub repository must have in order to be indexed.
    /// </summary>
    public int MinStars { get; set; } = DefaultMinStars;

    /// <summary>
    /// Number of results requested per page. GitHub currently caps this at 100 (limit verified on 6/24/2019).
    /// </summary>
    public int ResultsPerPage { get; set; } = DefaultResultsPerPage;

    /// <summary>
    /// Maximum number of results a single search query can expose. GitHub currently caps this at 1000 (limit verified on 6/24/2019).
    /// </summary>
    public int MaxGitHubResultsPerQuery { get; set; } = DefaultMaxGitHubResultsPerQuery;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using NuGetGallery;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Immutable result of one GitHub repository search request: the repositories found plus the
/// two timestamps that accompany them.
/// </summary>
public class GitHubSearchApiResponse
{
    /// <summary>
    /// Creates a response.
    /// </summary>
    /// <param name="result">Repositories returned by the request; must not be null.</param>
    /// <param name="date">Timestamp stored in <see cref="Date"/>.</param>
    /// <param name="throttleResetTime">Timestamp stored in <see cref="ThrottleResetTime"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
    public GitHubSearchApiResponse(IReadOnlyList<RepositoryInformation> result, DateTimeOffset date, DateTimeOffset throttleResetTime)
    {
        if (result == null)
        {
            throw new ArgumentNullException(nameof(result));
        }

        Result = result;
        Date = date;
        ThrottleResetTime = throttleResetTime;
    }

    /// <summary>Repositories returned by the request. Never null.</summary>
    public IReadOnlyList<RepositoryInformation> Result { get; }

    /// <summary>Time at which the response was produced (GitHubSearchWrapper fills it from the "Date" response header).</summary>
    public DateTimeOffset Date { get; }

    /// <summary>Time at which the rate limit resets (GitHubSearchWrapper fills it from the "X-RateLimit-Reset" response header).</summary>
    public DateTimeOffset ThrottleResetTime { get; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using NuGetGallery;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// <see cref="IGitHubSearchWrapper"/> implementation that sends repository search requests through an
/// Octokit <see cref="IGitHubClient"/> and extracts the throttling information from the response headers.
/// </summary>
public class GitHubSearchWrapper : IGitHubSearchWrapper
{
    private const string DateHeaderName = "Date";
    private const string RateLimitResetHeaderName = "X-RateLimit-Reset";

    // Format expected for the "Date" header, e.g. "Tue, 25 Jun 2019 17:30:00 GMT".
    private const string DateHeaderFormat = "ddd',' dd MMM yyyy HH:mm:ss 'GMT'";

    private readonly IGitHubClient _client;

    /// <summary>
    /// Creates a wrapper around the given client.
    /// </summary>
    /// <param name="client">Client used to reach the GitHub API; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public GitHubSearchWrapper(IGitHubClient client)
    {
        _client = client ?? throw new ArgumentNullException(nameof(client));
    }

    /// <summary>
    /// Returns the number of requests left according to the client's last API info.
    /// </summary>
    /// <returns>The remaining request count, or null when the client has no API info (or no rate limit info) yet.</returns>
    public int? GetRemainingRequestCount()
    {
        var apiInfo = _client.GetLastApiInfo();
        return apiInfo?.RateLimit?.Remaining;
    }

    /// <summary>
    /// Sends the search request and converts the reply into a <see cref="GitHubSearchApiResponse"/>.
    /// </summary>
    /// <param name="request">Search request to send; must not be null.</param>
    /// <returns>The repositories found, the response date and the rate limit reset time (both as local times).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidDataException">The "Date" or "X-RateLimit-Reset" header is missing or cannot be parsed.</exception>
    public async Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request)
    {
        if (request == null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        var apiResponse = await _client.Connection.Get<SearchRepositoryResult>(ApiUrls.SearchRepositories(), request.Parameters, null);
        var headers = apiResponse.HttpResponse.Headers;

        // The header is expressed in GMT, so parse it explicitly as a UTC value instead of relying on
        // ToLocalTime() treating an Unspecified kind as UTC.
        if (!headers.TryGetValue(DateHeaderName, out var ghStrDate)
            || !DateTime.TryParseExact(
                ghStrDate,
                DateHeaderFormat,
                CultureInfo.InvariantCulture,
                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
                out var ghTime))
        {
            throw new InvalidDataException("Date is required to compute the throttling time.");
        }

        // Machine-readable value (Unix time in seconds): parse it independently of the current culture.
        if (!headers.TryGetValue(RateLimitResetHeaderName, out var ghStrResetLimit)
            || !long.TryParse(ghStrResetLimit, NumberStyles.Integer, CultureInfo.InvariantCulture, out var ghResetTime))
        {
            throw new InvalidDataException("X-RateLimit-Reset is required to compute the throttling time.");
        }

        var repositories = apiResponse.Body.Items
            .Select(repo => new RepositoryInformation(
                $"{repo.Owner.Login}/{repo.Name}",
                repo.HtmlUrl,
                repo.StargazersCount,
                Array.Empty<string>()))
            .ToList();

        return new GitHubSearchApiResponse(
            repositories,
            ghTime.ToLocalTime(),
            DateTimeOffset.FromUnixTimeSeconds(ghResetTime).ToLocalTime());
    }
}
}
162 changes: 162 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/GitHub/GitHubSearcher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using NuGetGallery;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Collects popular C# repositories by repeatedly querying the GitHub repository search API through an
/// <see cref="IGitHubSearchWrapper"/>, lowering the upper star bound between batches of pages.
/// </summary>
public class GitHubSearcher : IGitRepoSearcher
{
    // Delay applied before retrying a request that was rejected with a RateLimitExceededException.
    private static readonly TimeSpan LimitExceededRetryTime = TimeSpan.FromSeconds(5);

    private readonly ILogger<GitHubSearcher> _logger;
    private readonly IOptionsSnapshot<GitHubSearcherConfiguration> _configuration;
    private readonly IGitHubSearchWrapper _searchApiRequester;

    // Estimated time at which the rate limit resets; updated by SearchRepo and consumed by CheckThrottle.
    private DateTimeOffset _throttleResetTime;

    /// <summary>
    /// Creates a searcher.
    /// </summary>
    /// <param name="searchApiRequester">Wrapper used to send the search requests; must not be null.</param>
    /// <param name="logger">Logger; must not be null.</param>
    /// <param name="configuration">Search configuration; must not be null.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public GitHubSearcher(
        IGitHubSearchWrapper searchApiRequester,
        ILogger<GitHubSearcher> logger,
        IOptionsSnapshot<GitHubSearcherConfiguration> configuration)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
        _searchApiRequester = searchApiRequester ?? throw new ArgumentNullException(nameof(searchApiRequester));
    }

    private int MinStars => _configuration.Value.MinStars;
    private int ResultsPerPage => _configuration.Value.ResultsPerPage;
    private int MaxGitHubResultsPerQuery => _configuration.Value.MaxGitHubResultsPerQuery;

    /// <summary>
    /// Searches GitHub for the C# repositories whose star count is at least the configured
    /// <see cref="GitHubSearcherConfiguration.MinStars"/>, removes duplicates and sorts them by star count, descending.
    /// </summary>
    /// <returns>De-duplicated list of the repositories found, ordered by descending star count.</returns>
    public async Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories()
    {
        _logger.LogInformation("Starting search on GitHub...");
        var result = await GetResultsFromGitHub();
        return result
            .GroupBy(x => x.Id) // Used to remove duplicate repos (since the GH Search API may return a result that we already had in memory)
            .Select(g => g.First())
            .OrderByDescending(x => x.Stars)
            .ToList();
    }

    /// <summary>
    /// Waits until the estimated throttle reset time when the wrapper reports that no request is left.
    /// </summary>
    private async Task CheckThrottle()
    {
        if (_searchApiRequester.GetRemainingRequestCount() == 0)
        {
            var sleepTime = _throttleResetTime - DateTimeOffset.Now;
            _throttleResetTime = DateTimeOffset.Now;
            if (sleepTime.TotalSeconds > 0)
            {
                _logger.LogInformation("Waiting {TotalSeconds} seconds to cooldown.", sleepTime.TotalSeconds);
                await Task.Delay(sleepTime);
            }

            _logger.LogInformation("Resuming search.");
        }
    }

    /// <summary>
    /// Sends one search request (retrying while the rate limit is exceeded) and refreshes the estimated
    /// throttle reset time when the previous estimate has elapsed.
    /// </summary>
    /// <param name="request">Request to send.</param>
    /// <returns>The repositories returned for the request.</returns>
    private async Task<IReadOnlyList<RepositoryInformation>> SearchRepo(SearchRepositoriesRequest request)
    {
        _logger.LogInformation("Requesting page {Page} for stars {Stars}", request.Page, request.Stars);

        var response = await GetResponseWithRetry(request);

        if (_throttleResetTime < DateTimeOffset.Now)
        {
            // Only the difference between the two server-side timestamps is used, so that a clock skew
            // between this machine and GitHub does not affect the computed wait.
            var timeToWait = response.ThrottleResetTime - response.Date;
            _throttleResetTime = DateTimeOffset.Now + timeToWait;
        }

        return response.Result;
    }

    /// <summary>
    /// Sends the request, retrying after <see cref="LimitExceededRetryTime"/> for as long as the wrapper
    /// throws <see cref="RateLimitExceededException"/>. Any other exception is propagated.
    /// </summary>
    private async Task<GitHubSearchApiResponse> GetResponseWithRetry(SearchRepositoriesRequest request)
    {
        while (true)
        {
            try
            {
                return await _searchApiRequester.GetResponse(request);
            }
            catch (RateLimitExceededException e)
            {
                // The exception is logged too, so that its details are not lost.
                _logger.LogError(0, e, "Exceeded GitHub RateLimit! Waiting for {LimitExceededRetryTime} before retrying.", LimitExceededRetryTime);
                await Task.Delay(LimitExceededRetryTime);
            }
        }
    }

    /// <summary>
    /// Pages through the search results, batch after batch. Each batch requests up to the configured number of
    /// pages for the star range [MinStars, upperStarBound]; the bound is then lowered to the star count of the
    /// last repository received.
    /// </summary>
    /// <returns>All the repositories received; the list may contain duplicates.</returns>
    /// <exception cref="InvalidOperationException">The configuration does not allow requesting at least one page.</exception>
    private async Task<List<RepositoryInformation>> GetResultsFromGitHub()
    {
        _throttleResetTime = DateTimeOffset.Now;
        var upperStarBound = int.MaxValue;
        var resultList = new List<RepositoryInformation>();
        var lastPage = Math.Ceiling(MaxGitHubResultsPerQuery / (double)ResultsPerPage);

        // Written as a negation so that NaN (0 / 0) is rejected as well. Without this check the paging loop
        // would be skipped and resultList.Last() would fail with an unhelpful "Sequence contains no elements".
        if (!(lastPage >= 1))
        {
            throw new InvalidOperationException(
                $"The GitHub searcher configuration does not allow requesting any page. {GetConfigInfo()}");
        }

        while (upperStarBound >= MinStars)
        {
            var page = 0;
            while (page < lastPage)
            {
                await CheckThrottle();

                var request = new SearchRepositoriesRequest
                {
                    Stars = new Range(MinStars, upperStarBound),
                    Language = Language.CSharp,
                    SortField = RepoSearchSort.Stars,
                    Order = SortDirection.Descending,
                    PerPage = ResultsPerPage,
                    Page = page + 1
                };

                var response = await SearchRepo(request);

                if (response == null || !response.Any())
                {
                    _logger.LogWarning("Search request didn't return any item. Page: {Page} {ConfigInfo}", request.Page, GetConfigInfo());
                    return resultList;
                }

                // TODO: Block unwanted repos (https://github.com/NuGet/NuGetGallery/issues/7298)
                resultList.AddRange(response);
                page++;

                if (page == lastPage && response.First().Stars == response.Last().Stars)
                {
                    // GitHub throttles us after a certain number of results per query.
                    // We can only construct queries based on number of stars a repository has.
                    // As a result, if too many repositories have the same number of stars,
                    // we will lose data because we can't create another query that filters out the results that we have already seen with the same number of stars.
                    _logger.LogWarning("Last page results have the same star count! This may result in missing data. StarCount: {Stars} {ConfigInfo}",
                        response.First().Stars,
                        GetConfigInfo());

                    return resultList;
                }
            }

            // The next batch starts at the star count of the last repository received. Repositories with that
            // exact count may be returned again; GetPopularRepositories removes the duplicates.
            upperStarBound = resultList.Last().Stars;
        }

        return resultList;
    }

    /// <summary>
    /// Formats the current configuration values for inclusion in log and error messages.
    /// </summary>
    private string GetConfigInfo()
    {
        return $"MinStars: {MinStars}\n" +
            $"ResultsPerPage: {ResultsPerPage}\n" +
            $"MaxGithubResultPerQuery: {MaxGitHubResultsPerQuery}\n";
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Threading.Tasks;
using Octokit;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Abstraction over the GitHub Repo Search Api: sends search requests and reports how many requests remain.
/// </summary>
public interface IGitHubSearchWrapper
{
/// <summary>
/// Queries the GitHub Repo Search Api and returns its response
/// </summary>
/// <param name="request">Request to be made to the GitHub Repo Search Api</param>
/// <returns>Parsed response of the GitHub Repo Search Api</returns>
Task<GitHubSearchApiResponse> GetResponse(SearchRepositoriesRequest request);

/// <summary>
/// Returns the number of remaining requests before the search gets throttled
/// </summary>
/// <returns>The number of remaining requests, or null if no info is available (no request has been done yet)</returns>
int? GetRemainingRequestCount();
}
}
18 changes: 18 additions & 0 deletions src/NuGet.Jobs.GitHubIndexer/GitRepoSearchers/IGitRepoSearcher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.Collections.Generic;
using System.Threading.Tasks;
using NuGetGallery;

namespace NuGet.Jobs.GitHubIndexer
{
/// <summary>
/// Source of popular C# Git repositories.
/// </summary>
public interface IGitRepoSearcher
{
/// <summary>
/// Searches for all popular C# repos, orders them in descending order and returns a list containing their basic information.
/// The GitHubSearcher implementation orders the repos by star count.
/// </summary>
/// <returns>List of popular C# repositories</returns>
Task<IReadOnlyList<RepositoryInformation>> GetPopularRepositories();
}
}
Loading

0 comments on commit 5202d2e

Please sign in to comment.