From c559add63e67ddc2914965d8b69d30b04b5971d2 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Thu, 9 Mar 2023 20:48:59 +0100 Subject: [PATCH 01/21] fetch publisher data from crates.io, start build artifact caching --- docker-compose.yml | 3 + src/db/add_package.rs | 21 ++- src/db/delete.rs | 8 +- src/docbuilder/caching.rs | 112 ++++++++++++++ src/docbuilder/mod.rs | 1 + src/docbuilder/rustwide_builder.rs | 52 ++++++- src/index/api.rs | 235 +++++++++++++++++++++-------- src/test/fakes.rs | 10 +- src/web/crate_details.rs | 17 ++- src/web/releases.rs | 5 +- 10 files changed, 377 insertions(+), 87 deletions(-) create mode 100644 src/docbuilder/caching.rs diff --git a/docker-compose.yml b/docker-compose.yml index 36180aa3d..195c2b788 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,7 @@ services: - "/var/run/docker.sock:/var/run/docker.sock" - ".rustwide-docker:/opt/docsrs/rustwide" - "cratesio-index:/opt/docsrs/prefix/crates.io-index" + - "artifact-cache:/opt/docsrs/prefix/artifact_cache" environment: DOCSRS_RUSTWIDE_WORKSPACE: /opt/docsrs/rustwide DOCSRS_DATABASE_URL: postgresql://cratesfyi:password@db @@ -23,6 +24,7 @@ services: S3_ENDPOINT: http://s3:9000 AWS_ACCESS_KEY_ID: cratesfyi AWS_SECRET_ACCESS_KEY: secret_key + DOCSRS_PREFIX: /opt/docsrs/prefix env_file: - .env healthcheck: @@ -94,3 +96,4 @@ volumes: postgres-data: {} minio-data: {} cratesio-index: {} + artifact-cache: {} diff --git a/src/db/add_package.rs b/src/db/add_package.rs index 1ba775c9a..1af14827c 100644 --- a/src/db/add_package.rs +++ b/src/db/add_package.rs @@ -2,7 +2,7 @@ use crate::{ db::types::Feature, docbuilder::{BuildResult, DocCoverage}, error::Result, - index::api::{CrateData, CrateOwner, ReleaseData}, + index::api::{CrateData, GithubUser, ReleaseData}, storage::CompressionAlgorithm, utils::MetadataPackage, web::crate_details::CrateDetails, @@ -371,7 +371,7 @@ pub fn update_crate_data_in_database( /// Adds owners into database fn update_owners_in_database( conn: &mut Client, - owners: &[CrateOwner], + owners: &[GithubUser], crate_id: i32, ) -> Result<()> { // Update any existing owner data since it is mutable and could have changed since last @@ -562,9 +562,10 @@ mod test { }, )?; - let owner1 = CrateOwner { + let owner1 = GithubUser { avatar: "avatar".into(), login: "login".into(), + ..Default::default() }; update_owners_in_database(&mut conn, &[owner1.clone()], crate_id)?; @@ -600,16 +601,18 @@ mod test { // set initial owner details update_owners_in_database( &mut conn, - &[CrateOwner { + &[GithubUser { login: "login".into(), avatar: "avatar".into(), + ..Default::default() }], crate_id, )?; - let updated_owner = CrateOwner { + let updated_owner = GithubUser { login: "login".into(), avatar: "avatar2".into(), + ..Default::default() }; update_owners_in_database(&mut conn, &[updated_owner.clone()], crate_id)?; @@ -645,17 +648,19 @@ mod test { // set initial owner details update_owners_in_database( &mut conn, - &[CrateOwner { + &[GithubUser { login: "login".into(), avatar: "avatar".into(), + ..Default::default() }], crate_id, )?; - let new_owners: Vec = (1..5) - .map(|i| CrateOwner { + let new_owners: Vec = (1..5) + .map(|i| GithubUser { login: format!("login{i}"), avatar: format!("avatar{i}"), + ..Default::default() }) .collect(); diff --git a/src/db/delete.rs b/src/db/delete.rs index 840459dcd..d8045c159 100644 --- a/src/db/delete.rs +++ b/src/db/delete.rs @@ -187,7 +187,7 @@ fn delete_crate_from_database(conn: &mut Client, name: &str, crate_id: i32) -> R #[cfg(test)] mod tests { use super::*; - use crate::index::api::CrateOwner; + use crate::index::api::GithubUser; use crate::test::{assert_success, wrapper}; use postgres::Client; use test_case::test_case; @@ -313,9 +313,10 @@ mod tests { .name("a") .version("1.0.0") .archive_storage(archive_storage) - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "malicious actor".into(), avatar: "https://example.org/malicious".into(), + ..Default::default() }) .create()?; assert!(release_exists(&mut db.conn(), v1)?); @@ -342,9 +343,10 @@ mod tests { .name("a") .version("2.0.0") .archive_storage(archive_storage) - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "Peter Rabbit".into(), avatar: "https://example.org/peter".into(), + ..Default::default() }) .create()?; assert!(release_exists(&mut db.conn(), v2)?); diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs new file mode 100644 index 000000000..19426cbcc --- /dev/null +++ b/src/docbuilder/caching.rs @@ -0,0 +1,112 @@ +use crate::utils::copy_dir_all; +use anyhow::{Context as _, Result}; +use std::{ + fs, io, + path::{Path, PathBuf}, +}; +use tracing::{debug, instrument, warn}; + +/// move cache folder to target, falling back to copy + delete on error. +fn move_or_copy + std::fmt::Debug, Q: AsRef + std::fmt::Debug>( + source: P, + dest: Q, +) -> io::Result<()> { + if let Some(parent) = dest.as_ref().parent() { + fs::create_dir_all(parent)?; + } + if let Err(err) = fs::rename(&source, &dest) { + warn!( + ?err, + ?source, + ?dest, + "could not move target directory, fall back to copy" + ); + copy_dir_all(&source, &dest)?; + fs::remove_dir_all(&source)?; + } + Ok(()) +} + +/// artifact caching with cleanup +#[derive(Debug)] +pub(crate) struct ArtifactCache { + cache_dir: PathBuf, +} + +impl ArtifactCache { + pub(crate) fn new(cache_dir: PathBuf) -> Result { + Ok(Self { cache_dir }) + } + + pub(crate) fn purge(&self) -> Result<()> { + fs::remove_dir_all(&self.cache_dir)?; + Ok(()) + } + + /// clean up a target directory. + /// + /// Should delete all things that shouldn't leak between + /// builds, so: + /// - doc-output + /// - ...? + #[instrument(skip(self))] + fn cleanup(&self, target_dir: &Path) -> Result<()> { + for item in fs::read_dir(target_dir)? { + // the first level of directories are the targets in most cases, + // delete their doc-directories + let item = item?; + let doc_dir = item.path().join("doc"); + if doc_dir.is_dir() { + debug!(?doc_dir, "cache dir cleanup"); + fs::remove_dir_all(doc_dir)?; + } + } + Ok(()) + } + + /// restore a cached target directory. + /// + /// Will just move the cache folder into the rustwide + /// target path. If that fails, will use `copy_dir_all`. + #[instrument(skip(self))] + pub(crate) fn restore_to + std::fmt::Debug>( + &self, + cache_key: &str, + target_dir: P, + ) -> Result<()> { + let target_dir = target_dir.as_ref(); + if target_dir.exists() { + // to be safe, while most of the time the dir doesn't exist, + // or is empty. + fs::remove_dir_all(target_dir).context("could not clean target directory")?; + } + + let cache_dir = self.cache_dir.join(cache_key); + if !cache_dir.exists() { + // when there is no existing cache dir, + // we can just create an empty target. + fs::create_dir_all(target_dir).context("could not create target directory")?; + return Ok(()); + } + + move_or_copy(cache_dir, target_dir).context("could not move cache directory to target")?; + Ok(()) + } + + #[instrument(skip(self))] + pub(crate) fn save + std::fmt::Debug>( + &self, + cache_key: &str, + target_dir: P, + ) -> Result<()> { + let cache_dir = self.cache_dir.join(cache_key); + if !cache_dir.exists() { + fs::create_dir_all(&cache_dir)?; + } + + move_or_copy(&target_dir, &cache_dir) + .context("could not move target directory to cache")?; + self.cleanup(&cache_dir)?; + Ok(()) + } +} diff --git a/src/docbuilder/mod.rs b/src/docbuilder/mod.rs index 6eadd9295..228c626fd 100644 --- a/src/docbuilder/mod.rs +++ b/src/docbuilder/mod.rs @@ -1,3 +1,4 @@ +mod caching; mod crates; mod limits; mod rustwide_builder; diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index d3b519bcf..4c27be8d8 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -1,3 +1,4 @@ +use super::caching::ArtifactCache; use crate::db::file::add_path_into_database; use crate::db::{ add_build_into_database, add_doc_coverage, add_package_into_database, @@ -27,13 +28,14 @@ use rustwide::{AlternativeRegistry, Build, Crate, Toolchain, Workspace, Workspac use std::collections::{HashMap, HashSet}; use std::path::Path; use std::sync::Arc; -use tracing::{debug, info, warn}; +use tracing::{debug, info, instrument, warn}; const USER_AGENT: &str = "docs.rs builder (https://github.com/rust-lang/docs.rs)"; const COMPONENTS: &[&str] = &["llvm-tools-preview", "rustc-dev", "rustfmt"]; const DUMMY_CRATE_NAME: &str = "empty-library"; const DUMMY_CRATE_VERSION: &str = "1.0.0"; +#[derive(Debug)] pub enum PackageKind<'a> { Local(&'a Path), CratesIo, @@ -48,6 +50,7 @@ pub struct RustwideBuilder { storage: Arc, metrics: Arc, index: Arc, + artifact_cache: ArtifactCache, rustc_version: String, repository_stats_updater: Arc, skip_build_if_exists: bool, @@ -90,6 +93,7 @@ impl RustwideBuilder { Ok(RustwideBuilder { workspace, toolchain, + artifact_cache: ArtifactCache::new(config.prefix.join("artifact_cache"))?, config, db: context.pool()?, storage: context.storage()?, @@ -200,6 +204,7 @@ impl RustwideBuilder { let has_changed = old_version.as_deref() != Some(&self.rustc_version); if has_changed { + self.artifact_cache.purge()?; self.add_essential_files()?; } Ok(has_changed) @@ -322,6 +327,7 @@ impl RustwideBuilder { self.build_package(&package.name, &package.version, PackageKind::Local(path)) } + #[instrument(skip(self))] pub fn build_package( &mut self, name: &str, @@ -386,6 +392,34 @@ impl RustwideBuilder { (|| -> Result { use docsrs_metadata::BuildTargets; + let release_data = match self + .index + .api() + .get_release_data(name, version) + .context("error fetching release data from crates.io") + { + Ok(data) => data, + Err(err) => { + warn!("{:#?}", err); + ReleaseData::default() + } + }; + + if let Some(ref published_by) = release_data.published_by { + info!( + host_target_dir=?build.host_target_dir(), + published_by_id=published_by.id, + published_by_login=published_by.login, + "restoring artifact cache", + ); + if let Err(err) = self + .artifact_cache + .restore_to(&published_by.id.to_string(), build.host_target_dir()) + { + warn!(?err, "could not restore artifact cache"); + } + } + let mut has_docs = false; let mut successful_targets = Vec::new(); let metadata = Metadata::from_crate_root(build.host_source_dir())?; @@ -537,6 +571,22 @@ impl RustwideBuilder { Err(err) => warn!("{:#?}", err), } + if let Some(ref published_by) = release_data.published_by { + info!( + host_target_dir=?build.host_target_dir(), + published_by_id=published_by.id, + published_by_login=published_by.login, + "saving artifact cache", + ); + if let Err(err) = self + .artifact_cache + .save(&published_by.id.to_string(), build.host_target_dir()) + .context("error giving back artifact cache") + { + warn!(?err, "could not give back artifact cache"); + }; + } + if res.result.successful { // delete eventually existing files from pre-archive storage. // we're doing this in the end so eventual problems in the build diff --git a/src/index/api.rs b/src/index/api.rs index 62e76f280..f6293581f 100644 --- a/src/index/api.rs +++ b/src/index/api.rs @@ -12,7 +12,6 @@ const APP_USER_AGENT: &str = concat!( include_str!(concat!(env!("OUT_DIR"), "/git_version")) ); -#[derive(Debug)] pub struct Api { api_base: Option, max_retries: u32, @@ -21,14 +20,15 @@ pub struct Api { #[derive(Debug)] pub struct CrateData { - pub(crate) owners: Vec, + pub(crate) owners: Vec, } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub(crate) struct ReleaseData { pub(crate) release_time: DateTime, pub(crate) yanked: bool, pub(crate) downloads: i32, + pub(crate) published_by: Option, } impl Default for ReleaseData { @@ -37,13 +37,31 @@ impl Default for ReleaseData { release_time: Utc::now(), yanked: false, downloads: 0, + published_by: None, } } } -#[derive(Debug, Clone)] -pub struct CrateOwner { +#[derive(Deserialize)] +struct VersionData { + num: Version, + #[serde(default = "Utc::now")] + created_at: DateTime, + #[serde(default)] + yanked: bool, + #[serde(default)] + downloads: i32, + #[serde(default)] + published_by: Option, +} + +#[derive(Debug, Clone, PartialEq, Deserialize, Default)] +pub struct GithubUser { + #[serde(default)] + pub(crate) id: u64, + #[serde(default)] pub(crate) avatar: String, + #[serde(default)] pub(crate) login: String, } @@ -82,23 +100,24 @@ impl Api { } pub(crate) fn get_release_data(&self, name: &str, version: &str) -> Result { - let (release_time, yanked, downloads) = self - .get_release_time_yanked_downloads(name, version) - .context(format!("Failed to get crate data for {name}-{version}"))?; + let version = Version::parse(version)?; + let data = self + .get_versions(name) + .context(format!("Failed to get crate data for {name}-{version}"))? + .into_iter() + .find(|data| data.num == version) + .with_context(|| anyhow!("Could not find version in response"))?; Ok(ReleaseData { - release_time, - yanked, - downloads, + release_time: data.created_at, + yanked: data.yanked, + downloads: data.downloads, + published_by: data.published_by, }) } /// Get release_time, yanked and downloads from the registry's API - fn get_release_time_yanked_downloads( - &self, - name: &str, - version: &str, - ) -> Result<(DateTime, bool, i32)> { + fn get_versions(&self, name: &str) -> Result> { let url = { let mut url = self.api_base()?; url.path_segments_mut() @@ -112,35 +131,17 @@ impl Api { versions: Vec, } - #[derive(Deserialize)] - struct VersionData { - num: Version, - #[serde(default = "Utc::now")] - created_at: DateTime, - #[serde(default)] - yanked: bool, - #[serde(default)] - downloads: i32, - } - let response: Response = retry( || Ok(self.client.get(url.clone()).send()?.error_for_status()?), self.max_retries, )? .json()?; - let version = Version::parse(version)?; - let version = response - .versions - .into_iter() - .find(|data| data.num == version) - .with_context(|| anyhow!("Could not find version in response"))?; - - Ok((version.created_at, version.yanked, version.downloads)) + Ok(response.versions) } /// Fetch owners from the registry's API - fn get_owners(&self, name: &str) -> Result> { + fn get_owners(&self, name: &str) -> Result> { let url = { let mut url = self.api_base()?; url.path_segments_mut() @@ -151,15 +152,7 @@ impl Api { #[derive(Deserialize)] struct Response { - users: Vec, - } - - #[derive(Deserialize)] - struct OwnerData { - #[serde(default)] - avatar: Option, - #[serde(default)] - login: Option, + users: Vec, } let response: Response = retry( @@ -168,22 +161,144 @@ impl Api { )? .json()?; - let result = response + Ok(response .users .into_iter() - .filter(|data| { - !data - .login - .as_ref() - .map(|login| login.is_empty()) - .unwrap_or_default() - }) - .map(|data| CrateOwner { - avatar: data.avatar.unwrap_or_default(), - login: data.login.unwrap_or_default(), - }) - .collect(); - - Ok(result) + .filter(|data| !data.login.is_empty()) + .collect()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::mock; + use serde_json::{self, json}; + + fn api() -> Api { + Api::new(Some(Url::parse(&mockito::server_url()).unwrap())).unwrap() + } + + #[test] + fn get_owners() { + let _m = mock("GET", "/api/v1/crates/krate/owners") + .with_header("content-type", "application/json") + .with_body( + serde_json::to_string(&json!({ + "users": [ + { + "id": 1, + "login": "the_first_owner", + "name": "name", + "avatar": "http://something" + }, + { + "id": 2, + "login": "the_second_owner", + "name": "another name", + "avatar": "http://anotherthing" + } + ]})) + .unwrap(), + ) + .create(); + + assert_eq!( + api().get_owners("krate").unwrap(), + vec![ + GithubUser { + id: 1, + avatar: "http://something".into(), + login: "the_first_owner".into() + }, + GithubUser { + id: 2, + avatar: "http://anotherthing".into(), + login: "the_second_owner".into() + } + ] + ); + } + + #[test] + fn get_release_info() { + let created = Utc::now(); + let _m = mock("GET", "/api/v1/crates/krate/versions") + .with_header("content-type", "application/json") + .with_body( + serde_json::to_string(&json!({ + "versions": [ + { + "num": "1.2.3", + "created_at": created.to_rfc3339(), + "yanked": true, + "downloads": 223, + "published_by": { + "id": 2, + "login": "the_second_owner", + "name": "another name", + "avatar": "http://anotherthing" + } + }, + { + "num": "2.2.3", + "created_at": Utc::now().to_rfc3339(), + "yanked": false, + "downloads": 333, + "published_by": { + "id": 1, + "login": "owner", + "name": "name", + } + } + ]})) + .unwrap(), + ) + .create(); + + assert_eq!( + api().get_release_data("krate", "1.2.3").unwrap(), + ReleaseData { + release_time: created, + yanked: true, + downloads: 223, + published_by: Some(GithubUser { + id: 2, + avatar: "http://anotherthing".into(), + login: "the_second_owner".into(), + }) + } + ); + } + + #[test] + fn get_release_info_without_publisher() { + let created = Utc::now(); + let _m = mock("GET", "/api/v1/crates/krate/versions") + .with_header("content-type", "application/json") + .with_body( + serde_json::to_string(&json!({ + "versions": [ + { + "num": "1.2.3", + "created_at": created.to_rfc3339(), + "yanked": true, + "downloads": 223, + "published_by": null + }, + ]})) + .unwrap(), + ) + .create(); + + assert_eq!( + api().get_release_data("krate", "1.2.3").unwrap(), + ReleaseData { + release_time: created, + yanked: true, + downloads: 223, + published_by: None, + } + ); } } diff --git a/src/test/fakes.rs b/src/test/fakes.rs index 5d2583972..4da8ff442 100644 --- a/src/test/fakes.rs +++ b/src/test/fakes.rs @@ -2,7 +2,7 @@ use super::TestDatabase; use crate::docbuilder::{BuildResult, DocCoverage}; use crate::error::Result; -use crate::index::api::{CrateData, CrateOwner, ReleaseData}; +use crate::index::api::{CrateData, GithubUser, ReleaseData}; use crate::storage::{rustdoc_archive_path, source_archive_path, Storage}; use crate::utils::{Dependency, MetadataPackage, Target}; use anyhow::Context; @@ -85,11 +85,7 @@ impl<'a> FakeRelease<'a> { doc_targets: Vec::new(), default_target: None, registry_crate_data: CrateData { owners: Vec::new() }, - registry_release_data: ReleaseData { - release_time: Utc::now(), - yanked: false, - downloads: 0, - }, + registry_release_data: ReleaseData::default(), has_docs: true, has_examples: false, readme: None, @@ -214,7 +210,7 @@ impl<'a> FakeRelease<'a> { self.source_file("README.md", content.as_bytes()) } - pub(crate) fn add_owner(mut self, owner: CrateOwner) -> Self { + pub(crate) fn add_owner(mut self, owner: GithubUser) -> Self { self.registry_crate_data.owners.push(owner); self } diff --git a/src/web/crate_details.rs b/src/web/crate_details.rs index ffdd2ef74..94fcd28e8 100644 --- a/src/web/crate_details.rs +++ b/src/web/crate_details.rs @@ -423,7 +423,7 @@ pub(crate) async fn get_all_releases( #[cfg(test)] mod tests { use super::*; - use crate::index::api::CrateOwner; + use crate::index::api::GithubUser; use crate::test::{ assert_cache_control, assert_redirect, assert_redirect_cached, wrapper, TestDatabase, }; @@ -820,9 +820,10 @@ mod tests { env.fake_release() .name("foo") .version("0.0.1") - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "foobar".into(), avatar: "https://example.org/foobar".into(), + ..Default::default() }) .create()?; @@ -838,13 +839,15 @@ mod tests { env.fake_release() .name("foo") .version("0.0.2") - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "foobar".into(), avatar: "https://example.org/foobarv2".into(), + ..Default::default() }) - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "barfoo".into(), avatar: "https://example.org/barfoo".into(), + ..Default::default() }) .create()?; @@ -865,9 +868,10 @@ mod tests { env.fake_release() .name("foo") .version("0.0.3") - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "barfoo".into(), avatar: "https://example.org/barfoo".into(), + ..Default::default() }) .create()?; @@ -883,9 +887,10 @@ mod tests { env.fake_release() .name("bar") .version("0.0.1") - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "barfoo".into(), avatar: "https://example.org/barfoov2".into(), + ..Default::default() }) .create()?; diff --git a/src/web/releases.rs b/src/web/releases.rs index f97f90556..ba0a5250b 100644 --- a/src/web/releases.rs +++ b/src/web/releases.rs @@ -749,7 +749,7 @@ pub(crate) async fn build_queue_handler( #[cfg(test)] mod tests { use super::*; - use crate::index::api::CrateOwner; + use crate::index::api::GithubUser; use crate::test::{ assert_redirect, assert_redirect_unchecked, assert_success, wrapper, TestFrontend, }; @@ -1473,9 +1473,10 @@ mod tests { let web = env.frontend(); env.fake_release() .name("some_random_crate") - .add_owner(CrateOwner { + .add_owner(GithubUser { login: "foobar".into(), avatar: "https://example.org/foobar".into(), + ..Default::default() }) .create()?; From fa1981f6c486cb15c410ee357fd836e03c29992d Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 10 Mar 2023 16:55:06 +0100 Subject: [PATCH 02/21] update docker-compose config so rustwide & prefix are on the same filesystem --- docker-compose.yml | 17 ++++++++++------- dockerfiles/entrypoint.sh | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 195c2b788..687d19cac 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,17 +14,21 @@ services: expose: ["3000"] volumes: - "/var/run/docker.sock:/var/run/docker.sock" - - ".rustwide-docker:/opt/docsrs/rustwide" - - "cratesio-index:/opt/docsrs/prefix/crates.io-index" - - "artifact-cache:/opt/docsrs/prefix/artifact_cache" + # we only need one of the two mappings here. + # this will link to your local filesystem, which makes build debugging + # easier + - "./ignored/opt-docsrs:/opt/docsrs_ext/" + # this uses a docker volume and will be faster, but you need to + # log into the container to debug its contents + # - "prefix-ext:/opt/docsrs_ext/" environment: - DOCSRS_RUSTWIDE_WORKSPACE: /opt/docsrs/rustwide + DOCSRS_RUSTWIDE_WORKSPACE: /opt/docsrs_ext/rustwide + DOCSRS_PREFIX: /opt/docsrs_ext/prefix DOCSRS_DATABASE_URL: postgresql://cratesfyi:password@db DOCSRS_STORAGE_BACKEND: s3 S3_ENDPOINT: http://s3:9000 AWS_ACCESS_KEY_ID: cratesfyi AWS_SECRET_ACCESS_KEY: secret_key - DOCSRS_PREFIX: /opt/docsrs/prefix env_file: - .env healthcheck: @@ -95,5 +99,4 @@ services: volumes: postgres-data: {} minio-data: {} - cratesio-index: {} - artifact-cache: {} + prefix-ext: {} diff --git a/dockerfiles/entrypoint.sh b/dockerfiles/entrypoint.sh index b8fdc61aa..3f19cb03d 100755 --- a/dockerfiles/entrypoint.sh +++ b/dockerfiles/entrypoint.sh @@ -2,7 +2,7 @@ set -euv -export DOCSRS_PREFIX=/opt/docsrs/prefix +export DOCSRS_PREFIX=${DOCSRS_PREFIX:-"/opt/docsrs/prefix"} export DOCSRS_DOCKER=true export DOCSRS_LOG=${DOCSRS_LOG-"docs-rs,rustwide=info"} export PATH="$PATH:/build/target/release" From ef74011a2ab70e42c9ed3834e175eac06e55894b Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 10 Mar 2023 16:55:22 +0100 Subject: [PATCH 03/21] always move the cache / targets, no copy needed --- src/docbuilder/caching.rs | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index 19426cbcc..1f4dfc95b 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -1,32 +1,10 @@ -use crate::utils::copy_dir_all; use anyhow::{Context as _, Result}; use std::{ - fs, io, + fs, path::{Path, PathBuf}, }; use tracing::{debug, instrument, warn}; -/// move cache folder to target, falling back to copy + delete on error. -fn move_or_copy + std::fmt::Debug, Q: AsRef + std::fmt::Debug>( - source: P, - dest: Q, -) -> io::Result<()> { - if let Some(parent) = dest.as_ref().parent() { - fs::create_dir_all(parent)?; - } - if let Err(err) = fs::rename(&source, &dest) { - warn!( - ?err, - ?source, - ?dest, - "could not move target directory, fall back to copy" - ); - copy_dir_all(&source, &dest)?; - fs::remove_dir_all(&source)?; - } - Ok(()) -} - /// artifact caching with cleanup #[derive(Debug)] pub(crate) struct ArtifactCache { @@ -89,7 +67,7 @@ impl ArtifactCache { return Ok(()); } - move_or_copy(cache_dir, target_dir).context("could not move cache directory to target")?; + fs::rename(cache_dir, target_dir).context("could not move cache directory to target")?; Ok(()) } @@ -100,12 +78,12 @@ impl ArtifactCache { target_dir: P, ) -> Result<()> { let cache_dir = self.cache_dir.join(cache_key); - if !cache_dir.exists() { - fs::create_dir_all(&cache_dir)?; + if cache_dir.exists() { + fs::remove_dir_all(&cache_dir)?; } - move_or_copy(&target_dir, &cache_dir) - .context("could not move target directory to cache")?; + debug!(?target_dir, ?cache_dir, "saving artifact cache"); + fs::rename(&target_dir, &cache_dir).context("could not move target directory to cache")?; self.cleanup(&cache_dir)?; Ok(()) } From 3fdf1fd26f81185ea1caeb96246f1fa4cbb20543 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 10 Mar 2023 17:00:37 +0100 Subject: [PATCH 04/21] also clean doc-output for proc macro crates --- src/docbuilder/caching.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index 1f4dfc95b..3a6a20ee8 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -29,6 +29,14 @@ impl ArtifactCache { /// - ...? #[instrument(skip(self))] fn cleanup(&self, target_dir: &Path) -> Result<()> { + // proc-macro crates have a `doc` directory + // directly in the target. + let doc_dir = target_dir.join("doc"); + if doc_dir.is_dir() { + debug!(?doc_dir, "cache dir cleanup"); + fs::remove_dir_all(doc_dir)?; + } + for item in fs::read_dir(target_dir)? { // the first level of directories are the targets in most cases, // delete their doc-directories From a82e6dcd461be76bee146ba075837de3b3a9a415 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 10 Mar 2023 17:03:52 +0100 Subject: [PATCH 05/21] adapt comment --- src/docbuilder/caching.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index 3a6a20ee8..4a3f6a49c 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -62,8 +62,8 @@ impl ArtifactCache { ) -> Result<()> { let target_dir = target_dir.as_ref(); if target_dir.exists() { - // to be safe, while most of the time the dir doesn't exist, - // or is empty. + // Delete the target dir to be safe, even though most of the + // time the dir doesn't exist or is empty. fs::remove_dir_all(target_dir).context("could not clean target directory")?; } From bf8440b2ca952506007d60a20e4629beb7c2e438 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sat, 11 Mar 2023 18:10:10 +0100 Subject: [PATCH 06/21] add freeing disk space functionliaty --- Cargo.lock | 24 +++++++++ Cargo.toml | 1 + src/docbuilder/caching.rs | 110 +++++++++++++++++++++++++++++++++++--- 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6847b73a4..14f801586 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1539,6 +1539,7 @@ dependencies = [ "string_cache_codegen", "strum", "syntect", + "sysinfo", "tempfile", "tera", "test-case", @@ -3334,6 +3335,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntapi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc51db7b362b205941f71232e56c625156eb9a929f8cf74a428fd5bc094a4afc" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -4985,6 +4995,20 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "sysinfo" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e847e2de7a137c8c2cede5095872dbb00f4f9bf34d061347e36b43322acd56" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "winapi", +] + [[package]] name = "tar" version = "0.4.38" diff --git a/Cargo.toml b/Cargo.toml index ad2315b5b..5dff498f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ bzip2 = "0.4.4" serde_cbor = "0.11.1" getrandom = "0.2.1" itertools = { version = "0.10.5", optional = true} +sysinfo = { version = "0.28.2", default-features = false } # Async tokio = { version = "1.0", features = ["rt-multi-thread", "signal", "macros"] } diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index 4a3f6a49c..c9f5dd53b 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -1,10 +1,33 @@ -use anyhow::{Context as _, Result}; +use anyhow::{bail, Context as _, Result}; use std::{ - fs, + collections::HashMap, + fs::{self, OpenOptions}, path::{Path, PathBuf}, + time::SystemTime, }; +use sysinfo::{DiskExt, RefreshKind, System, SystemExt}; use tracing::{debug, instrument, warn}; +static LAST_ACCESSED_FILE_NAME: &str = "docsrs_last_accessed"; + +/// gives you the percentage of free disk space on the +/// filesystem where the given `path` lives on. +/// Return value is between 0 and 1. +fn free_disk_space_ratio>(path: P) -> Result { + let sys = System::new_with_specifics(RefreshKind::new().with_disks()); + + let disk_by_mount_point: HashMap<_, _> = + sys.disks().iter().map(|d| (d.mount_point(), d)).collect(); + + let path = path.as_ref(); + + if let Some(disk) = path.ancestors().find_map(|p| disk_by_mount_point.get(p)) { + Ok((disk.available_space() as f64 / disk.total_space() as f64) as f32) + } else { + bail!("could not find mount point for path {}", path.display()); + } +} + /// artifact caching with cleanup #[derive(Debug)] pub(crate) struct ArtifactCache { @@ -23,10 +46,8 @@ impl ArtifactCache { /// clean up a target directory. /// - /// Should delete all things that shouldn't leak between - /// builds, so: - /// - doc-output - /// - ...? + /// Will: + /// * delete the doc output in the root & target directories #[instrument(skip(self))] fn cleanup(&self, target_dir: &Path) -> Result<()> { // proc-macro crates have a `doc` directory @@ -50,6 +71,78 @@ impl ArtifactCache { Ok(()) } + fn cache_dir_for_key(&self, cache_key: &str) -> PathBuf { + self.cache_dir.join(cache_key) + } + + /// update the "last used" marker for the cache key + fn touch(&self, cache_key: &str) -> Result<()> { + let file = self + .cache_dir_for_key(cache_key) + .join(LAST_ACCESSED_FILE_NAME); + + fs::create_dir_all(file.parent().expect("we always have a parent"))?; + if file.exists() { + fs::remove_file(&file)?; + } + OpenOptions::new().create(true).write(true).open(&file)?; + Ok(()) + } + + /// return the list of cache-directories, sorted by last usage. + /// + /// The oldest / least used cache will be first. + /// To be used for cleanup. + /// + /// A missing age-marker file is interpreted as "old age". + fn all_cache_folders_by_age(&self) -> Result> { + let mut entries: Vec<(PathBuf, Option)> = fs::read_dir(&self.cache_dir)? + .filter_map(Result::ok) + .filter_map(|entry| { + let path = entry.path(); + path.is_dir().then(|| { + let last_accessed = path + .join(LAST_ACCESSED_FILE_NAME) + .metadata() + .and_then(|metadata| metadata.modified()) + .ok(); + (path, last_accessed) + }) + }) + .collect(); + + // `None` will appear first after sorting + entries.sort_by_key(|(_, time)| *time); + + Ok(entries.into_iter().map(|(path, _)| path).collect()) + } + + /// free up disk space by deleting the oldest cache folders. + /// + /// Deletes cache folders until the `free_percent_goal` is reached. + pub(crate) fn clear_disk_space(&self, free_percent_goal: f32) -> Result<()> { + let space_ok = + || -> Result { Ok(free_disk_space_ratio(&self.cache_dir)? >= free_percent_goal) }; + + if space_ok()? { + return Ok(()); + } + + for folder in self.all_cache_folders_by_age()? { + warn!( + ?folder, + "freeing up disk space by deleting oldest cache folder" + ); + fs::remove_dir_all(&folder)?; + + if space_ok()? { + break; + } + } + + Ok(()) + } + /// restore a cached target directory. /// /// Will just move the cache folder into the rustwide @@ -67,7 +160,7 @@ impl ArtifactCache { fs::remove_dir_all(target_dir).context("could not clean target directory")?; } - let cache_dir = self.cache_dir.join(cache_key); + let cache_dir = self.cache_dir_for_key(cache_key); if !cache_dir.exists() { // when there is no existing cache dir, // we can just create an empty target. @@ -85,7 +178,7 @@ impl ArtifactCache { cache_key: &str, target_dir: P, ) -> Result<()> { - let cache_dir = self.cache_dir.join(cache_key); + let cache_dir = self.cache_dir_for_key(cache_key); if cache_dir.exists() { fs::remove_dir_all(&cache_dir)?; } @@ -93,6 +186,7 @@ impl ArtifactCache { debug!(?target_dir, ?cache_dir, "saving artifact cache"); fs::rename(&target_dir, &cache_dir).context("could not move target directory to cache")?; self.cleanup(&cache_dir)?; + self.touch(cache_key)?; Ok(()) } } From 4e67646caf83ca10a13df23bcf3863f8d1d152aa Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sat, 11 Mar 2023 18:11:04 +0100 Subject: [PATCH 07/21] update log messages --- src/docbuilder/rustwide_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 4c27be8d8..0e7b96a21 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -581,9 +581,9 @@ impl RustwideBuilder { if let Err(err) = self .artifact_cache .save(&published_by.id.to_string(), build.host_target_dir()) - .context("error giving back artifact cache") + .context("error saving artifact cache") { - warn!(?err, "could not give back artifact cache"); + warn!(?err, "could not save artifact cache"); }; } From b0126a41a639be970c10ace613623b6d7ea410d1 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sat, 11 Mar 2023 20:09:06 +0100 Subject: [PATCH 08/21] extend tests for crate-publisher fetching --- src/index/api.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/index/api.rs b/src/index/api.rs index f6293581f..e3f233171 100644 --- a/src/index/api.rs +++ b/src/index/api.rs @@ -176,7 +176,7 @@ mod tests { use serde_json::{self, json}; fn api() -> Api { - Api::new(Some(Url::parse(&mockito::server_url()).unwrap())).unwrap() + Api::new(Some(Url::parse(&mockito::server_url()).unwrap()), 1).unwrap() } #[test] @@ -190,13 +190,17 @@ mod tests { "id": 1, "login": "the_first_owner", "name": "name", - "avatar": "http://something" + "avatar": "http://something", + "kind": "user", + "url": "https://github.com/the_second_owner" }, { "id": 2, "login": "the_second_owner", "name": "another name", - "avatar": "http://anotherthing" + "avatar": "http://anotherthing", + "kind": "user", + "url": "https://github.com/the_second_owner" } ]})) .unwrap(), From a15fc75d70a481d522084563652282dab07f457e Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sat, 11 Mar 2023 20:10:45 +0100 Subject: [PATCH 09/21] extend tests for crate-release info fetchign --- src/index/api.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/index/api.rs b/src/index/api.rs index e3f233171..df682c297 100644 --- a/src/index/api.rs +++ b/src/index/api.rs @@ -237,6 +237,7 @@ mod tests { "created_at": created.to_rfc3339(), "yanked": true, "downloads": 223, + "license": "MIT", "published_by": { "id": 2, "login": "the_second_owner", @@ -249,6 +250,7 @@ mod tests { "created_at": Utc::now().to_rfc3339(), "yanked": false, "downloads": 333, + "license": "MIT", "published_by": { "id": 1, "login": "owner", From 09588f176c9b83d8a3c8995bf908f7351ffec31a Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 2 Jun 2023 11:07:38 +0200 Subject: [PATCH 10/21] ensure if target directory exists --- src/docbuilder/caching.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index c9f5dd53b..ab8f2161b 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -36,11 +36,21 @@ pub(crate) struct ArtifactCache { impl ArtifactCache { pub(crate) fn new(cache_dir: PathBuf) -> Result { - Ok(Self { cache_dir }) + let cache = Self { cache_dir }; + cache.ensure_cache_exists()?; + Ok(cache) } pub(crate) fn purge(&self) -> Result<()> { fs::remove_dir_all(&self.cache_dir)?; + self.ensure_cache_exists()?; + Ok(()) + } + + fn ensure_cache_exists(&self) -> Result<()> { + if !self.cache_dir.exists() { + fs::create_dir_all(&self.cache_dir)?; + } Ok(()) } @@ -168,6 +178,7 @@ impl ArtifactCache { return Ok(()); } + self.ensure_cache_exists()?; fs::rename(cache_dir, target_dir).context("could not move cache directory to target")?; Ok(()) } @@ -182,6 +193,7 @@ impl ArtifactCache { if cache_dir.exists() { fs::remove_dir_all(&cache_dir)?; } + self.ensure_cache_exists()?; debug!(?target_dir, ?cache_dir, "saving artifact cache"); fs::rename(&target_dir, &cache_dir).context("could not move target directory to cache")?; From 2bc2cab02db248656a97edc7cd12fdcdb387273a Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 2 Jun 2023 16:17:23 +0200 Subject: [PATCH 11/21] add more logging context, try cleaning up target on restore --- src/docbuilder/caching.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index ab8f2161b..d8a68dba5 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -178,8 +178,9 @@ impl ArtifactCache { return Ok(()); } - self.ensure_cache_exists()?; + fs::create_dir_all(target_dir.parent().unwrap())?; fs::rename(cache_dir, target_dir).context("could not move cache directory to target")?; + self.cleanup(target_dir)?; Ok(()) } From c48e0f4b5f42817458ef02b523178a5ab3c2b8b1 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 4 Jun 2023 07:49:30 +0200 Subject: [PATCH 12/21] don't try to create target directory when it already exists --- src/docbuilder/rustwide_builder.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 1e77631a8..be47198ac 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -761,7 +761,12 @@ impl RustwideBuilder { let old_dir = target_dir.join("doc"); let new_dir = target_dir.join(target).join("doc"); debug!("rename {} to {}", old_dir.display(), new_dir.display()); - std::fs::create_dir(target_dir.join(target))?; + { + let parent = new_dir.parent().unwrap(); + if !parent.exists() { + std::fs::create_dir(target_dir.join(target))?; + } + } std::fs::rename(old_dir, new_dir)?; } From ce2bedc94c7a14aa6b3e519744aef3b3e1573545 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 4 Jun 2023 10:59:23 +0200 Subject: [PATCH 13/21] purge artifact cache for tests --- src/docbuilder/rustwide_builder.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index be47198ac..7d604bedc 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -70,8 +70,12 @@ impl RustwideBuilder { }; builder = builder.sandbox_image(image); } + + let artifact_cache = ArtifactCache::new(config.prefix.join("artifact_cache"))?; + if cfg!(test) { builder = builder.fast_init(true); + artifact_cache.purge()?; } let workspace = builder.init().map_err(FailureError::compat)?; @@ -93,7 +97,7 @@ impl RustwideBuilder { Ok(RustwideBuilder { workspace, toolchain, - artifact_cache: ArtifactCache::new(config.prefix.join("artifact_cache"))?, + artifact_cache, config, db: context.pool()?, storage: context.storage()?, From 3758e0d1a06994dca8a9665eb3c41d85f4c496af Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 10:24:20 +0200 Subject: [PATCH 14/21] start adding cache test --- src/docbuilder/rustwide_builder.rs | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 7d604bedc..97a144929 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -34,6 +34,7 @@ const USER_AGENT: &str = "docs.rs builder (https://github.com/rust-lang/docs.rs) const COMPONENTS: &[&str] = &["llvm-tools-preview", "rustc-dev", "rustfmt"]; const DUMMY_CRATE_NAME: &str = "empty-library"; const DUMMY_CRATE_VERSION: &str = "1.0.0"; +const DUMMY_CRATE_PUBLISHER_ID: &str = "2299951"; // pietroalbini #[derive(Debug)] pub enum PackageKind<'a> { @@ -1129,6 +1130,42 @@ mod tests { }) } + #[test] + #[ignore] + fn test_artifact_cache() { + wrapper(|env| { + let crate_ = DUMMY_CRATE_NAME; + let version = DUMMY_CRATE_VERSION; + let expected_cache_dir = env + .config() + .prefix + .join("artifact_cache") + .join(DUMMY_CRATE_PUBLISHER_ID); + + let mut builder = RustwideBuilder::init(env).unwrap(); + + // first build creates the cache + assert!(!expected_cache_dir.exists()); + assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); + assert!(expected_cache_dir.exists()); + + // cache dir doesn't contain doc output + assert!(!expected_cache_dir.join("doc").exists()); + + // but seems to be a normal cargo target directory + for expected_file in &["CACHEDIR.TAG", "debug"] { + assert!(expected_cache_dir.join(expected_file).exists()); + } + + // do a second build + assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); + + // FIXME: how would I know if the cache was used? + + Ok(()) + }); + } + #[test] #[ignore] fn test_proc_macro() { From c3f4901ed4be442567f931feefaa0074b0aaa21c Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 11:08:19 +0200 Subject: [PATCH 15/21] add global config to enable artifact cache, debug logging for test --- src/config.rs | 2 ++ src/docbuilder/rustwide_builder.rs | 44 ++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/config.rs b/src/config.rs index 0cf907626..b9ddb2895 100644 --- a/src/config.rs +++ b/src/config.rs @@ -95,6 +95,7 @@ pub struct Config { // Build params pub(crate) build_attempts: u16, + pub(crate) use_build_artifact_cache: bool, pub(crate) rustwide_workspace: PathBuf, pub(crate) inside_docker: bool, pub(crate) docker_image: Option, @@ -130,6 +131,7 @@ impl Config { Ok(Self { build_attempts: env("DOCSRS_BUILD_ATTEMPTS", 5)?, + use_build_artifact_cache: env("DOCSRS_USE_BUILD_ARTIFACT_CACHE", true)?, crates_io_api_call_retries: env("DOCSRS_CRATESIO_API_CALL_RETRIES", 3)?, diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 97a144929..1d37cb266 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -51,7 +51,7 @@ pub struct RustwideBuilder { storage: Arc, metrics: Arc, index: Arc, - artifact_cache: ArtifactCache, + artifact_cache: Option, rustc_version: String, repository_stats_updater: Arc, skip_build_if_exists: bool, @@ -72,11 +72,17 @@ impl RustwideBuilder { builder = builder.sandbox_image(image); } - let artifact_cache = ArtifactCache::new(config.prefix.join("artifact_cache"))?; + let artifact_cache = if config.use_build_artifact_cache { + Some(ArtifactCache::new(config.prefix.join("artifact_cache"))?) + } else { + None + }; if cfg!(test) { builder = builder.fast_init(true); - artifact_cache.purge()?; + if let Some(ref artifact_cache) = artifact_cache { + artifact_cache.purge()?; + } } let workspace = builder.init().map_err(FailureError::compat)?; @@ -209,7 +215,9 @@ impl RustwideBuilder { let has_changed = old_version.as_deref() != Some(&self.rustc_version); if has_changed { - self.artifact_cache.purge()?; + if let Some(ref artifact_cache) = self.artifact_cache { + artifact_cache.purge()?; + } self.add_essential_files()?; } Ok(has_changed) @@ -418,15 +426,16 @@ impl RustwideBuilder { } }; - if let Some(ref published_by) = release_data.published_by { + if let (Some(ref artifact_cache), Some(ref published_by)) = + (&self.artifact_cache, release_data.published_by) + { info!( host_target_dir=?build.host_target_dir(), published_by_id=published_by.id, published_by_login=published_by.login, "restoring artifact cache", ); - if let Err(err) = self - .artifact_cache + if let Err(err) = artifact_cache .restore_to(&published_by.id.to_string(), build.host_target_dir()) { warn!(?err, "could not restore artifact cache"); @@ -587,15 +596,16 @@ impl RustwideBuilder { } } - if let Some(ref published_by) = release_data.published_by { + if let (Some(artifact_cache), Some(ref published_by)) = + (&self.artifact_cache, release_data.published_by) + { info!( host_target_dir=?build.host_target_dir(), published_by_id=published_by.id, published_by_login=published_by.login, "saving artifact cache", ); - if let Err(err) = self - .artifact_cache + if let Err(err) = artifact_cache .save(&published_by.id.to_string(), build.host_target_dir()) .context("error saving artifact cache") { @@ -1147,21 +1157,27 @@ mod tests { // first build creates the cache assert!(!expected_cache_dir.exists()); assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); + + for chld in std::fs::read_dir(expected_cache_dir.parent().unwrap())? { + dbg!(&chld); + } + assert!(expected_cache_dir.exists()); // cache dir doesn't contain doc output assert!(!expected_cache_dir.join("doc").exists()); - // but seems to be a normal cargo target directory + // but seems to be a normal cargo target directory, + // which also means that `build_package` actually used the + // target directory, and it was moved into the cache afterwards. for expected_file in &["CACHEDIR.TAG", "debug"] { assert!(expected_cache_dir.join(expected_file).exists()); } - // do a second build + // do a second build, + // should not fail assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); - // FIXME: how would I know if the cache was used? - Ok(()) }); } From 231de62f1d3ec33572a6415dfd9309e61dec579d Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 11:25:13 +0200 Subject: [PATCH 16/21] try other publisher id for the test --- src/docbuilder/rustwide_builder.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 1d37cb266..825e1da70 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -34,7 +34,6 @@ const USER_AGENT: &str = "docs.rs builder (https://github.com/rust-lang/docs.rs) const COMPONENTS: &[&str] = &["llvm-tools-preview", "rustc-dev", "rustfmt"]; const DUMMY_CRATE_NAME: &str = "empty-library"; const DUMMY_CRATE_VERSION: &str = "1.0.0"; -const DUMMY_CRATE_PUBLISHER_ID: &str = "2299951"; // pietroalbini #[derive(Debug)] pub enum PackageKind<'a> { @@ -959,6 +958,8 @@ mod tests { use crate::test::{assert_redirect, assert_success, wrapper}; use serde_json::Value; + const DUMMY_CRATE_PUBLISHER_ID: &str = "4825"; + #[test] #[ignore] fn test_build_crate() { @@ -1167,10 +1168,14 @@ mod tests { // cache dir doesn't contain doc output assert!(!expected_cache_dir.join("doc").exists()); + for chld in std::fs::read_dir(&expected_cache_dir)? { + dbg!(&chld); + } + // but seems to be a normal cargo target directory, // which also means that `build_package` actually used the // target directory, and it was moved into the cache afterwards. - for expected_file in &["CACHEDIR.TAG", "debug"] { + for expected_file in &["docsrs_last_accessed", "debug", ".rustc_info.json"] { assert!(expected_cache_dir.join(expected_file).exists()); } From bb96f3b7f17c87a5e43e7d03415078ef89749ac2 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 18:53:25 +0200 Subject: [PATCH 17/21] remove debug-output from tests --- src/docbuilder/rustwide_builder.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 825e1da70..015d27e3f 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -1158,20 +1158,11 @@ mod tests { // first build creates the cache assert!(!expected_cache_dir.exists()); assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); - - for chld in std::fs::read_dir(expected_cache_dir.parent().unwrap())? { - dbg!(&chld); - } - assert!(expected_cache_dir.exists()); // cache dir doesn't contain doc output assert!(!expected_cache_dir.join("doc").exists()); - for chld in std::fs::read_dir(&expected_cache_dir)? { - dbg!(&chld); - } - // but seems to be a normal cargo target directory, // which also means that `build_package` actually used the // target directory, and it was moved into the cache afterwards. @@ -1179,8 +1170,7 @@ mod tests { assert!(expected_cache_dir.join(expected_file).exists()); } - // do a second build, - // should not fail + // do a second build, should not fail assert!(builder.build_package(crate_, version, PackageKind::CratesIo)?); Ok(()) From ce370b1e9fb387484bfe368caa8f778dfeb1a1ee Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 21:20:30 +0200 Subject: [PATCH 18/21] start adding caching tests --- src/docbuilder/caching.rs | 124 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index d8a68dba5..42d0c5569 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -203,3 +203,127 @@ impl ArtifactCache { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_creates_cache_dir() { + let parent = tempfile::tempdir().unwrap(); + + let cache_dir = parent.path().join("cache"); + assert!(!cache_dir.exists()); + ArtifactCache::new(cache_dir.clone()).unwrap(); + assert!(cache_dir.exists()); + } + + #[test] + fn test_purge() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache_path = cache_dir.path(); + let cache = ArtifactCache::new(cache_path.to_owned()).unwrap(); + + // put something into the cache + for key in &["1", "2", "3"] { + cache.touch(key).unwrap(); + } + + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 3); + + cache.purge().unwrap(); + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 0); + assert!(cache_path.exists()); + } + + #[test] + fn test_save_restore() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache_path = cache_dir.path(); + let cache = ArtifactCache::new(cache_path.to_owned()).unwrap(); + + // create a dummy target directory + let target_dir = tempfile::tempdir().unwrap(); + let target_path = target_dir.path(); + + let linux_gnu = target_path.join("linux-gnu"); + fs::create_dir_all(&linux_gnu).unwrap(); + + // docs will be deleted while saving + let docs = target_path.join("doc"); + fs::create_dir(&docs).unwrap(); + fs::write(docs.join("1.html"), b"content").unwrap(); + let target_docs = linux_gnu.join("doc"); + fs::create_dir(&target_docs).unwrap(); + fs::write(target_docs.join("2.html"), b"content").unwrap(); + + // other files in the directory will be kept + fs::write(target_path.join("file1"), b"content").unwrap(); + fs::write(linux_gnu.join("file2"), b"content").unwrap(); + + cache.save("cache_key", target_path).unwrap(); + + // target is gone + assert!(!target_path.exists()); + + let saved_cache = cache_path.join("cache_key"); + // doc output is gone + assert!(!saved_cache.join("doc").exists()); + assert!(!saved_cache.join("linux-gnu").join("doc").exists()); + // the rest is there + assert!(saved_cache.join("file1").exists()); + assert!(saved_cache.join("linux-gnu").join("file2").exists()); + + cache.restore_to("cache_key", target_path).unwrap(); + // target is back + assert!(target_path.exists()); + assert!(target_path.join("file1").exists()); + assert!(target_path.join("linux-gnu").join("file2").exists()); + // cache is gone + assert!(!saved_cache.exists()); + } + + #[test] + fn test_target_doesnt_exist() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache = ArtifactCache::new(cache_dir.path().to_owned()).unwrap(); + + let parent = tempfile::tempdir().unwrap(); + + let potential_target = parent.path().join("target"); + cache.restore_to("cache_key", &potential_target).unwrap(); + + assert!(potential_target.exists()); + } + + #[test] + fn test_source_doesnt_exist_errors() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache = ArtifactCache::new(cache_dir.path().to_owned()).unwrap(); + + let parent = tempfile::tempdir().unwrap(); + + let non_existing_source = parent.path().join("source"); + assert!(cache.save("cache_key", non_existing_source).is_err()); + } + + #[test] + fn test_clean_disk_space() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache_path = cache_dir.path(); + let cache = ArtifactCache::new(cache_path.to_owned()).unwrap(); + + // put something into the cache + for key in &["1", "2", "3"] { + cache.touch(key).unwrap(); + } + + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 3); + + // will clean everything + cache.clear_disk_space(100.0).unwrap(); + + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 0); + assert!(cache_path.exists()); + } +} From a11b75bc5715f769ab41f347021f8547ec8b83db Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 18 Jun 2023 21:38:25 +0200 Subject: [PATCH 19/21] fix cache dir cleanup for free disk space, add test --- src/docbuilder/caching.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/docbuilder/caching.rs b/src/docbuilder/caching.rs index 42d0c5569..af705fddd 100644 --- a/src/docbuilder/caching.rs +++ b/src/docbuilder/caching.rs @@ -14,7 +14,7 @@ static LAST_ACCESSED_FILE_NAME: &str = "docsrs_last_accessed"; /// filesystem where the given `path` lives on. /// Return value is between 0 and 1. fn free_disk_space_ratio>(path: P) -> Result { - let sys = System::new_with_specifics(RefreshKind::new().with_disks()); + let sys = System::new_with_specifics(RefreshKind::new().with_disks().with_disks_list()); let disk_by_mount_point: HashMap<_, _> = sys.disks().iter().map(|d| (d.mount_point(), d)).collect(); @@ -326,4 +326,24 @@ mod tests { assert_eq!(fs::read_dir(cache_path).unwrap().count(), 0); assert!(cache_path.exists()); } + + #[test] + fn test_dont_clean_disk_space() { + let cache_dir = tempfile::tempdir().unwrap(); + let cache_path = cache_dir.path(); + let cache = ArtifactCache::new(cache_path.to_owned()).unwrap(); + + // put something into the cache + for key in &["1", "2", "3"] { + cache.touch(key).unwrap(); + } + + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 3); + + // will clean nothing + cache.clear_disk_space(0.0).unwrap(); + + assert_eq!(fs::read_dir(cache_path).unwrap().count(), 3); + assert!(cache_path.exists()); + } } From 184a00d1ab626a33bfa06be3612b684fae9e92a3 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Mon, 19 Jun 2023 07:57:45 +0200 Subject: [PATCH 20/21] start freeing disk-space --- src/config.rs | 4 ++++ src/docbuilder/rustwide_builder.rs | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/src/config.rs b/src/config.rs index b9ddb2895..5c11ffd1e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -93,6 +93,10 @@ pub struct Config { /// same for the `static.docs.rs` distribution pub cloudfront_distribution_id_static: Option, + // some caches do automatic cleanup based on this free disk space goal. + // Value shoudl be between 0 and 1. + pub(crate) free_disk_space_goal: float, + // Build params pub(crate) build_attempts: u16, pub(crate) use_build_artifact_cache: bool, diff --git a/src/docbuilder/rustwide_builder.rs b/src/docbuilder/rustwide_builder.rs index 015d27e3f..eaaf0b930 100644 --- a/src/docbuilder/rustwide_builder.rs +++ b/src/docbuilder/rustwide_builder.rs @@ -610,6 +610,13 @@ impl RustwideBuilder { { warn!(?err, "could not save artifact cache"); }; + + if let Err(err) = artifact_cache + .clear_disk_space(self.config.free_disk_space_goal) + .context("error freeing disk space on artifact cache") + { + warn!(?err, "could not clear disk space"); + } } if res.result.successful { From 70771d51d4d6606913bd5dabc40c0f77cd582837 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Mon, 19 Jun 2023 21:13:07 +0200 Subject: [PATCH 21/21] add config var for free disk space goal --- src/config.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index 5c11ffd1e..f880be602 100644 --- a/src/config.rs +++ b/src/config.rs @@ -95,7 +95,7 @@ pub struct Config { // some caches do automatic cleanup based on this free disk space goal. // Value shoudl be between 0 and 1. - pub(crate) free_disk_space_goal: float, + pub(crate) free_disk_space_goal: f32, // Build params pub(crate) build_attempts: u16, @@ -134,6 +134,8 @@ impl Config { let prefix: PathBuf = require_env("DOCSRS_PREFIX")?; Ok(Self { + free_disk_space_goal: env("DOCSRS_FREE_DISK_SPACE_GOAL", 0.2)?, + build_attempts: env("DOCSRS_BUILD_ATTEMPTS", 5)?, use_build_artifact_cache: env("DOCSRS_USE_BUILD_ARTIFACT_CACHE", true)?,