Skip to content

Commit

Permalink
Refactor github updater to handle rate limits better
Browse files Browse the repository at this point in the history
  • Loading branch information
Nemo157 authored and Joshua Nelson committed Jul 5, 2020
1 parent 4c7cdb6 commit 1daffbe
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 120 deletions.
2 changes: 0 additions & 2 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
CRATESFYI_GITHUB_USERNAME=
CRATESFYI_GITHUB_ACCESSTOKEN=
CRATESFYI_PREFIX=ignored/cratesfyi-prefix
CRATESFYI_DATABASE_URL=postgresql://cratesfyi:password@localhost
RUST_LOG=cratesfyi,rustwide=info
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ zstd = "0.5"
git2 = { version = "0.13.6", default-features = false }
path-slash = "0.1.3"
once_cell = { version = "1.4.0", features = ["parking_lot"] }
base64 = "0.12.1"

# Data serialization and deserialization
serde = { version = "1.0", features = ["derive"] }
Expand Down
4 changes: 2 additions & 2 deletions src/bin/cratesfyi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,8 @@ impl DatabaseSubcommand {
}

Self::UpdateGithubFields => {
cratesfyi::utils::github_updater(&*ctx.conn()?)
.expect("Failed to update github fields");
cratesfyi::utils::GithubUpdater::new(&*ctx.config()?, ctx.pool()?)?
.update_all_crates()?;
}

Self::AddDirectory { directory, prefix } => {
Expand Down
19 changes: 18 additions & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ pub struct Config {
pub(crate) max_pool_size: u32,
pub(crate) min_pool_idle: u32,

// Github authentication
pub(crate) github_username: Option<String>,
pub(crate) github_accesstoken: Option<String>,

// Max size of the files served by the docs.rs frontend
pub(crate) max_file_size: usize,
pub(crate) max_file_size_html: usize,
Expand All @@ -26,10 +30,20 @@ impl Config {
max_pool_size: env("DOCSRS_MAX_POOL_SIZE", 90)?,
min_pool_idle: env("DOCSRS_MIN_POOL_IDLE", 10)?,

github_username: maybe_env("CRATESFYI_GITHUB_USERNAME")?,
github_accesstoken: maybe_env("CRATESFYI_GITHUB_ACCESSTOKEN")?,

max_file_size: env("DOCSRS_MAX_FILE_SIZE", 50 * 1024 * 1024)?,
max_file_size_html: env("DOCSRS_MAX_FILE_SIZE_HTML", 5 * 1024 * 1024)?,
})
}

pub fn github_auth(&self) -> Option<(&str, &str)> {
Some((
self.github_username.as_deref()?,
self.github_accesstoken.as_deref()?,
))
}
}

fn env<T>(var: &str, default: T) -> Result<T, Error>
Expand Down Expand Up @@ -58,7 +72,10 @@ where
.parse::<T>()
.map(Some)
.with_context(|_| format!("failed to parse configuration variable {}", var))?),
Err(VarError::NotPresent) => Ok(None),
Err(VarError::NotPresent) => {
log::debug!("optional configuration variable {} is not set", var);
Ok(None)
}
Err(VarError::NotUnicode(_)) => bail!("configuration variable {} is not UTF-8", var),
}
}
16 changes: 6 additions & 10 deletions src/utils/daemon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use crate::{
db::Pool,
utils::{github_updater, queue_builder, update_release_activity},
utils::{queue_builder, update_release_activity, GithubUpdater},
BuildQueue, Config, DocBuilder, DocBuilderOptions,
};
use chrono::{Timelike, Utc};
Expand All @@ -21,11 +21,7 @@ pub fn start_daemon(
build_queue: Arc<BuildQueue>,
enable_registry_watcher: bool,
) -> Result<(), Error> {
const CRATE_VARIABLES: [&str; 3] = [
"CRATESFYI_PREFIX",
"CRATESFYI_GITHUB_USERNAME",
"CRATESFYI_GITHUB_ACCESSTOKEN",
];
const CRATE_VARIABLES: &[&str] = &["CRATESFYI_PREFIX"];

// first check required environment variables
for v in CRATE_VARIABLES.iter() {
Expand Down Expand Up @@ -96,13 +92,13 @@ pub fn start_daemon(
},
)?;

// update github stats every 6 hours
let cloned_db = db.clone();
// update github stats every hour
let github_updater = GithubUpdater::new(&config, db.clone())?;
cron(
"github stats updater",
Duration::from_secs(60 * 60 * 6),
Duration::from_secs(60 * 60),
move || {
github_updater(&*cloned_db.get()?)?;
github_updater.update_all_crates()?;
Ok(())
},
)?;
Expand Down
254 changes: 150 additions & 104 deletions src/utils/github_updater.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
use crate::error::Result;
use crate::{db::Pool, Config};
use chrono::{DateTime, Utc};
use failure::err_msg;
use log::debug;
use log::{debug, warn};
use postgres::Connection;
use regex::Regex;
use std::str::FromStr;
use reqwest::header::{HeaderValue, ACCEPT, AUTHORIZATION, USER_AGENT};
use serde::Deserialize;

const APP_USER_AGENT: &str = concat!(
env!("CARGO_PKG_NAME"),
" ",
include_str!(concat!(env!("OUT_DIR"), "/git_version"))
);

/// Fields we need use in cratesfyi
#[derive(Debug)]
Expand All @@ -16,115 +24,153 @@ struct GitHubFields {
last_commit: DateTime<Utc>,
}

/// Updates github fields in crates table
pub fn github_updater(conn: &Connection) -> Result<()> {
// TODO: This query assumes repository field in Cargo.toml is
// always the same across all versions of a crate
for row in &conn.query(
"SELECT DISTINCT ON (crates.name)
crates.name,
crates.id,
releases.repository_url
FROM crates
INNER JOIN releases ON releases.crate_id = crates.id
WHERE releases.repository_url ~ '^https?://github.com' AND
(crates.github_last_update < NOW() - INTERVAL '1 day' OR
crates.github_last_update IS NULL)
ORDER BY crates.name, releases.release_time DESC",
&[],
)? {
let crate_name: String = row.get(0);
let crate_id: i32 = row.get(1);
let repository_url: String = row.get(2);

if let Err(err) = get_github_path(&repository_url[..])
.ok_or_else(|| err_msg("Failed to get github path"))
.and_then(|path| get_github_fields(&path[..]))
.and_then(|fields| {
conn.execute(
"UPDATE crates
SET github_description = $1,
github_stars = $2, github_forks = $3,
github_issues = $4, github_last_commit = $5,
github_last_update = NOW()
WHERE id = $6",
&[
&fields.description,
&(fields.stars as i32),
&(fields.forks as i32),
&(fields.issues as i32),
&fields.last_commit.naive_utc(),
&crate_id,
],
)
.or_else(|e| Err(e.into()))
})
{
debug!("Failed to update github fields of: {} {}", crate_name, err);
pub struct GithubUpdater {
client: reqwest::blocking::Client,
pool: Pool,
}

impl GithubUpdater {
pub fn new(config: &Config, pool: Pool) -> Result<Self> {
let mut headers = vec![
(USER_AGENT, HeaderValue::from_static(APP_USER_AGENT)),
(ACCEPT, HeaderValue::from_static("application/json")),
];

if let Some((username, accesstoken)) = config.github_auth() {
let basicauth = format!(
"Basic {}",
base64::encode(format!("{}:{}", username, accesstoken))
);
headers.push((AUTHORIZATION, HeaderValue::from_str(&basicauth).unwrap()));
} else {
warn!("No GitHub authorization specified, will be working with very low rate limits");
}

// sleep for rate limits
use std::thread;
use std::time::Duration;
thread::sleep(Duration::from_secs(2));
let client = reqwest::blocking::Client::builder()
.default_headers(headers.into_iter().collect())
.build()?;

Ok(GithubUpdater { client, pool })
}

Ok(())
}
/// Updates github fields in crates table
pub fn update_all_crates(&self) -> Result<()> {
debug!("Starting update of all crates");

if self.is_rate_limited()? {
warn!("Skipping update because of rate limit");
return Ok(());
}

let conn = self.pool.get()?;
// TODO: This query assumes repository field in Cargo.toml is
// always the same across all versions of a crate
let rows = conn.query(
"SELECT DISTINCT ON (crates.name)
crates.name,
crates.id,
releases.repository_url
FROM crates
INNER JOIN releases ON releases.crate_id = crates.id
WHERE releases.repository_url ~ '^https?://github.com' AND
(crates.github_last_update < NOW() - INTERVAL '1 day' OR
crates.github_last_update IS NULL)
ORDER BY crates.name, releases.release_time DESC",
&[],
)?;

for row in &rows {
let crate_name: String = row.get(0);
let crate_id: i32 = row.get(1);
let repository_url: String = row.get(2);

debug!("Updating {}", crate_name);
if let Err(err) = self.update_crate(&conn, crate_id, &repository_url) {
if self.is_rate_limited()? {
warn!("Skipping remaining updates because of rate limit");
return Ok(());
}
warn!("Failed to update {}: {}", crate_name, err);
}
}

debug!("Completed all updates");
Ok(())
}

fn is_rate_limited(&self) -> Result<bool> {
#[derive(Deserialize)]
struct Response {
resources: Resources,
}

#[derive(Deserialize)]
struct Resources {
core: Resource,
}

#[derive(Deserialize)]
struct Resource {
remaining: u64,
}

let url = "https://api.github.com/rate_limit";
let response: Response = self.client.get(url).send()?.error_for_status()?.json()?;

fn get_github_fields(path: &str) -> Result<GitHubFields> {
use serde_json::Value;

let body = {
use reqwest::{blocking::Client, header::USER_AGENT, StatusCode};
use std::{env, io::Read};

let client = Client::new();
let mut body = String::new();

let mut resp = client
.get(&format!("https://api.github.com/repos/{}", path)[..])
.header(
USER_AGENT,
format!("cratesfyi/{}", env!("CARGO_PKG_VERSION")),
)
.basic_auth(
env::var("CRATESFYI_GITHUB_USERNAME")
.ok()
.unwrap_or_default(),
env::var("CRATESFYI_GITHUB_ACCESSTOKEN").ok(),
)
.send()?;

if resp.status() != StatusCode::OK {
return Err(err_msg("Failed to get github data"));
Ok(response.resources.core.remaining == 0)
}

fn update_crate(&self, conn: &Connection, crate_id: i32, repository_url: &str) -> Result<()> {
let path =
get_github_path(repository_url).ok_or_else(|| err_msg("Failed to get github path"))?;
let fields = self.get_github_fields(&path)?;

conn.execute(
"UPDATE crates
SET github_description = $1,
github_stars = $2, github_forks = $3,
github_issues = $4, github_last_commit = $5,
github_last_update = NOW()
WHERE id = $6",
&[
&fields.description,
&(fields.stars as i32),
&(fields.forks as i32),
&(fields.issues as i32),
&fields.last_commit.naive_utc(),
&crate_id,
],
)?;

Ok(())
}

fn get_github_fields(&self, path: &str) -> Result<GitHubFields> {
#[derive(Deserialize)]
struct Response {
#[serde(default)]
description: Option<String>,
#[serde(default)]
stargazers_count: i64,
#[serde(default)]
forks_count: i64,
#[serde(default)]
open_issues: i64,
#[serde(default = "Utc::now")]
pushed_at: DateTime<Utc>,
}

resp.read_to_string(&mut body)?;
body
};

let json = Value::from_str(&body[..])?;
let obj = json.as_object().unwrap();

Ok(GitHubFields {
description: obj
.get("description")
.and_then(|d| d.as_str())
.unwrap_or("")
.to_string(),
stars: obj
.get("stargazers_count")
.and_then(|d| d.as_i64())
.unwrap_or(0),
forks: obj.get("forks_count").and_then(|d| d.as_i64()).unwrap_or(0),
issues: obj.get("open_issues").and_then(|d| d.as_i64()).unwrap_or(0),
last_commit: DateTime::parse_from_rfc3339(
obj.get("pushed_at").and_then(|d| d.as_str()).unwrap_or(""),
)
.map(|datetime| datetime.with_timezone(&Utc))
.unwrap_or_else(|_| Utc::now()),
})
let url = format!("https://api.github.com/repos/{}", path);
let response: Response = self.client.get(&url).send()?.error_for_status()?.json()?;

Ok(GitHubFields {
description: response.description.unwrap_or_default(),
stars: response.stargazers_count,
forks: response.forks_count,
issues: response.open_issues,
last_commit: response.pushed_at,
})
}
}

fn get_github_path(url: &str) -> Option<String> {
Expand Down
Loading

0 comments on commit 1daffbe

Please sign in to comment.